Example #1
def f1_average(y_true, y_pred):
    """
    Return the average of the F1 scores of the two classes.
    """
    f1_survived = f1_score(y_true, y_pred, pos_label=1, average="binary")
    f1_died = f1_score(y_true, y_pred, pos_label=0, average="binary")
    return np.mean([f1_survived, f1_died])
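A quick standalone check (toy labels, not part of the project above): for binary targets, averaging the two per-class binary F1 scores gives the same value as passing average="macro".

import numpy as np
from sklearn.metrics import f1_score

y_true = [0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 1, 1]

# mean of the two one-vs-rest binary F1 scores ...
manual = np.mean([f1_score(y_true, y_pred, pos_label=1, average="binary"),
                  f1_score(y_true, y_pred, pos_label=0, average="binary")])
# ... equals the built-in macro average
assert np.isclose(manual, f1_score(y_true, y_pred, average="macro"))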
Example #2
def test_auto_weight():
    # Test class weights for imbalanced data
    from sklearn.linear_model import LogisticRegression
    # We take as dataset the two-dimensional projection of iris so
    # that it is not separable and remove half of predictors from
    # class 1.
    # We add one to the targets as a non-regression test: class_weight="balanced"
    # used to work only when the labels were a range [0..K).
    from sklearn.utils import compute_class_weight
    X, y = iris.data[:, :2], iris.target + 1
    unbalanced = np.delete(np.arange(y.size), np.where(y > 2)[0][::2])

    classes = np.unique(y[unbalanced])
    class_weights = compute_class_weight('balanced', classes, y[unbalanced])
    assert_true(np.argmax(class_weights) == 2)

    for clf in (svm.SVC(kernel='linear'), svm.LinearSVC(random_state=0),
                LogisticRegression()):
        # check that the score is better when class_weight='balanced' is set.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight='balanced')
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],).predict(X)
        assert_true(metrics.f1_score(y, y_pred, average='weighted')
                    <= metrics.f1_score(y, y_pred_balanced,
                                        average='weighted'))
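As a side note, a minimal sketch of what class_weight='balanced' computes (toy labels assumed; keyword-argument form of compute_class_weight in current scikit-learn): every class receives weight n_samples / (n_classes * class_count).

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([1, 1, 1, 1, 1, 1, 2, 2, 3, 3])
classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
# same formula written out by hand
manual = len(y) / (len(classes) * np.bincount(y)[classes])
assert np.allclose(weights, manual)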
Example #3
File: model.py Project: guker/mlp-1
 def f1(self, X, y):
     n_class = len(np.unique(y))
     prediction = self.predict(X)
     if n_class > 2:
         return f1_score(y, prediction, average='weighted')
     else:
         return f1_score(y, prediction)
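A short standalone illustration (toy labels) of why the branch above is needed: the default average='binary' only applies to two-class targets, so multiclass predictions need an explicit averaging mode such as 'weighted'.

from sklearn.metrics import f1_score

y_true = [0, 1, 2, 2, 1]
y_pred = [0, 2, 2, 2, 1]
print(f1_score(y_true, y_pred, average='weighted'))  # fine for three classes
# f1_score(y_true, y_pred)  # default average='binary' would raise a ValueError here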
Example #4
def evaluation(y_test=None, y_predict=None, n_classes=None):
    """
    Given the predicted results, the target results and the number of classes,
    return the confusion matrix, the F1 score of each class,
    the accuracy and the macro F1 score.

    Parameters
    ----------
    y_test : list
        The target results
    y_predict : list
        The predicted results
    n_classes : int
        The number of classes

    Examples
    --------
    >>> c_mat, f1, acc, f1_macro = tl.utils.evaluation(y_test, y_predict, n_classes)

    """
    c_mat = confusion_matrix(y_test, y_predict, labels=[x for x in range(n_classes)])
    f1 = f1_score(y_test, y_predict, average=None, labels=[x for x in range(n_classes)])
    f1_macro = f1_score(y_test, y_predict, average='macro')
    acc = accuracy_score(y_test, y_predict)
    tl.logging.info('confusion matrix: \n%s' % c_mat)
    tl.logging.info('f1-score        : %s' % f1)
    tl.logging.info('f1-score(macro) : %f' % f1_macro)  # same output with > f1_score(y_true, y_pred, average='macro')
    tl.logging.info('accuracy-score  : %f' % acc)
    return c_mat, f1, acc, f1_macro
Example #5
def compute_ref(true_tags, out_file, data_type='svm_light'):
    tag_map = {'OK': 1, 'BAD': 0, u'OK': 1, u'BAD': 0}
    predicted = []
    if data_type == 'svm_light':
        tag_map_pred = {'+1': 1, '-1': 0}
        for line in open(out_file):
            label = line[line.find(':')+1:line.find(' ')]
            predicted.append(tag_map_pred[label])
    elif data_type == 'crfpp' or data_type == 'crf_suite':
        for line in open(out_file):
            line = line.strip('\n')
            if line == '':
                continue
            tag = line.split('\t')[-1]
            if tag == 'OK' or tag == 'BAD':
                predicted.append(tag)
        predicted = [tag_map[t] for t in predicted]
#    if (type(true_tags[0]) is str or type(true_tags[0]) is unicode) and not true_tags[0].isdigit():
    true_tags = [tag_map[t] for t in true_tags]
#    if type(predicted[0]) is str and not predicted[0].isdigit():
    print(true_tags[:10])
    print(predicted[:10])

    print(f1_score(true_tags, predicted, average=None))
    print(f1_score(true_tags, predicted, average='weighted', pos_label=None))
Example #6
def get_f1_and_classification_report(embeddings_dict, classifier):
    xs, ys, y_pred = get_xs_ys_predictions(embeddings_dict, classifier)
    class_names = ['verbs', 'nouns', 'adjectives', 'closed class words']
    report = classification_report(y_true=ys, y_pred=y_pred, target_names=class_names)
    micro_f1 = f1_score(y_true=ys, y_pred=y_pred, average='micro')
    macro_f1 = f1_score(y_true=ys, y_pred=y_pred, average='macro')
    return micro_f1, macro_f1, report
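A small aside that helps when reading micro versus macro numbers (standalone toy data): for single-label multiclass problems, micro-averaged F1 is identical to plain accuracy, which is why macro F1 is often the more informative of the two.

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

ys = [0, 1, 2, 2, 1, 0, 3]
y_pred = [0, 2, 2, 2, 1, 0, 1]
assert np.isclose(f1_score(ys, y_pred, average='micro'),
                  accuracy_score(ys, y_pred))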
Example #7
def evaluation(y_test=None, y_predict=None, n_classes=None):
    """
    Given the predicted results, the target results and the number of classes,
    return the confusion matrix, the F1 score of each class,
    the accuracy and the macro F1 score.

    Parameters
    ----------
    y_test : numpy.array or list
        target results
    y_predict : numpy.array or list
        predicted results
    n_classes : int
        number of classes

    Examples
    --------
    >>> c_mat, f1, acc, f1_macro = evaluation(y_test, y_predict, n_classes)
    """
    from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
    c_mat = confusion_matrix(y_test, y_predict, labels = [x for x in range(n_classes)])
    f1    = f1_score(y_test, y_predict, average = None, labels = [x for x in range(n_classes)])
    f1_macro = f1_score(y_test, y_predict, average='macro')
    acc   = accuracy_score(y_test, y_predict)
    print('confusion matrix: \n',c_mat)
    print('f1-score:',f1)
    print('f1-score(macro):',f1_macro)   # same output with > f1_score(y_true, y_pred, average='macro')
    print('accuracy-score:', acc)
    return c_mat, f1, acc, f1_macro
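A possible standalone call of the function above on toy data: average=None returns one F1 value per label, and the macro score is their unweighted mean.

y_test = [0, 0, 1, 1, 2, 2]
y_predict = [0, 1, 1, 1, 2, 0]
c_mat, f1, acc, f1_macro = evaluation(y_test, y_predict, n_classes=3)
# f1 is an array with one score per class; f1_macro equals f1.mean() here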
Example #8
def benchmark(clf, train_X, train_y, test_X, test_y, encoder):
    """
    Benchmark the classifier based on its F1 score.
    """
    t0 = time()
    clf.fit(train_X, train_y)
    train_time = time() - t0

    t0 = time()
    pred = clf.predict(test_X)
    test_time = time() - t0

    score = metrics.f1_score(test_y, pred, average='micro')
    scores = metrics.f1_score(test_y, pred, average=None)
    counter = Counter(train_y)
    counter = [(k, v) for k, v in counter.iteritems()]
    counter.sort(key=lambda a: a[1], reverse=True)
    if len(counter) > 20:
        tops = [v[0] for v in counter[0:20]]
    else:
        tops = [v[0] for v in counter]
    labels = encoder.inverse_transform(tops)
    s = [scores[v] for v in tops]
    labeled_scores = zip(labels, s)

    return clf, score, labeled_scores, train_time, test_time
Example #9
    def on_epoch_end(self, epoch, logs={}):
        print logs

        corr=0
        tot=0
        preds = self.model.predict(self.dev_data, verbose=1)
        preds_text=[]
        for l in preds:
            preds_text.append(self.index2label[np.argmax(l)])

        print "Micro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"micro")
        print "Macro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"macro")
        print "Macro recall:", recall_score(self.dev_labels_text,preds_text,average=u"macro")

        if self.best_mr < recall_score(self.dev_labels_text,preds_text,average=u"macro"):
            self.best_mr = recall_score(self.dev_labels_text,preds_text,average=u"macro")
            model.save_weights(self.model_name + '_full_' + str(epoch) + '_MR_' + str(self.best_mr) + '.hdf5')
            print 'Saved Weights!'


        print classification_report(self.dev_labels_text, preds_text)
        for i in xrange(len(self.dev_labels)):

        #    next_index = sample(preds[i])
            next_index = np.argmax(preds[i])
            # print preds[i],next_index,index2label[next_index]

            l = self.index2label[next_index]

            # print "correct:", index2label[np.argmax(dev_labels[i])], "predicted:",l
            if self.index2label[np.argmax(self.dev_labels[i])]==l:
                corr+=1
            tot+=1
        print corr,"/",tot
Example #10
def predict_evaluate_models(fn, ax=None, sel=["Penalties_Conceeded","Tries_Scored"], goal="Referee", verbosity=0):
    class_weight = 'auto'
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity-1)
    if verbosity > 2:
        y_shuffled = y.copy()
        np.random.shuffle(y_shuffled)
        print ("All zeros accuracy:",1.0-np.sum(y)/len(y)) 
        print ("y_shuffled f1_csore:",metrics.f1_score(y, y_shuffled))

    n_folds = 10
    cv = cross_validation.StratifiedKFold(y, n_folds=n_folds)
    #cv = cross_validation.LeaveOneOut(n=len(y))
    results = []
    for sclf in ('svm','svmp','svmr','lgCV','gnb','rf','knc'):
        clf = get_clf(sclf,class_weight=class_weight)
        y_pred = cross_validation.cross_val_predict(clf, X, y, cv=cv)
        #print "pred:",y_pred
        res = [
            metrics.accuracy_score(y, y_pred),
            metrics.precision_score(y, y_pred),
            metrics.recall_score(y, y_pred),
            metrics.f1_score(y, y_pred),
            ]
        if verbosity > 0:
            print (sclf,res) 
        results.append( (sclf,res) )

    return results
Example #11
    def compare_2_models(model1, model2, X, y, h):
        h = min(X.shape[0], h)
        hidden_layer = features[np.random.choice(X.shape[0],
                                                 h,
                                                 replace=False)]
        print('training 1st model')
        pr = cProfile.Profile()
        pr.enable()
        model1.fit(X, y, hidden_layer=hidden_layer)
        y1 = model1.predict(X)
        pr.disable()
        ps = pstats.Stats(pr).sort_stats('cumulative')
        ps.print_stats()

        print('training 2nd model')
        pr = cProfile.Profile()
        pr.enable()
        model2.fit(X, y, hidden_layer=hidden_layer)
        y2 = model2.predict(X)
        pr.disable()
        ps = pstats.Stats(pr).sort_stats('cumulative')
        ps.print_stats()

        print(f1_score(y, y2))
        print(f1_score(y, y1))

        return np.allclose(y1, y2)
Example #12
def baseline_graph_experiment(model, data_fn, data_name, model_name):
    print "Running graph experiment (%s)..." % (data_name,)

    A, X, Y = data_fn()

    A = np.asarray(A)
    X = np.asarray(X)
    Y = np.asarray(Y)

    n_nodes = A.shape[0]

    indices = np.arange(n_nodes)
    np.random.shuffle(indices)

    train_indices = indices[: n_nodes // 3]
    valid_indices = indices[n_nodes // 3 : (2 * n_nodes) // 3]
    test_indices = indices[(2 * n_nodes) // 3 :]

    model.fit_with_validation(A, X, Y, train_indices, valid_indices)

    preds = model.predict(A, X, test_indices)
    actuals = Y[test_indices, :]

    accuracy = accuracy_score(actuals, preds)
    f1_micro = f1_score(actuals, preds, average="micro")
    f1_macro = f1_score(actuals, preds, average="macro")

    print "form: name,micro_f,macro_f,accuracy"
    print "###RESULTS###: %s,%s,%.8f,%.8f,%.8f" % (data_name, model_name, f1_micro, f1_macro, accuracy)
Example #13
def getScores(y, yPredTrain, yTest, yPredTest):

    scores = dict()

    scores['f1Train'] = f1_score(y, yPredTrain)
    scores['f1Test'] = f1_score(yTest, yPredTest)


    scores['accTrain'] = accuracy_score(y, yPredTrain)
    scores['accTest'] = accuracy_score(yTest, yPredTest)
    

    scores['rocTrain'] = roc_auc_score(y, yPredTrain)
    scores['rocTest'] = roc_auc_score(yTest, yPredTest)
    

    scores['cMatrixTrain'] = confusion_matrix(y, yPredTrain)
    scores['cMatrixTest'] = confusion_matrix(yTest, yPredTest)

    proba = float(len(np.where(y==1)[0]))/len(y)
    if proba < 0.50:
        proba = 1 - proba
    scores['random'] = proba
    
    return scores
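One detail worth noting about the snippet above: roc_auc_score appears to be given hard class predictions; it is more commonly fed probability scores, e.g. (standalone sketch with made-up data):

import numpy as np
from sklearn.metrics import roc_auc_score

y = np.array([0, 1, 1, 0, 1])
proba = np.array([0.2, 0.9, 0.6, 0.4, 0.7])  # e.g. clf.predict_proba(X)[:, 1]
print(roc_auc_score(y, proba))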
Example #14
def main():
	f = open("me.stdout", "r").read()

	print f
	
	(confusionMatrix, labels, ytrue, ypred, trueCount) = readConfusionMatrix.readText(f)
	for row in confusionMatrix:
		print row

	precisionMicro = np.float(metrics.precision_score(ytrue, ypred, average="micro"))
	recallMicro = np.float(metrics.recall_score(ytrue, ypred, average="micro"))
	f1Micro = np.float(metrics.f1_score(ytrue, ypred, average="micro"))
	f1Macro = np.float(metrics.f1_score(ytrue, ypred, pos_label=1, average="macro"))
	precisionMacro = np.float(metrics.precision_score(ytrue, ypred, average="macro"))
	recallMacro = np.float(metrics.recall_score(ytrue, ypred, average="macro"))

	mConf = metrics.confusion_matrix(ytrue, ypred)
	print mConf

	print labels
	print len(ytrue)
	print len(ypred)
	print trueCount

	print metrics.accuracy_score(ytrue, ypred)

	print precisionMicro
	print recallMicro
	print f1Micro
	print f1Macro
	print precisionMacro
	print recallMacro
Example #15
def kernel_graph_experiment(model, data_fn, data_name, model_name):
    print "Running graph experiment (%s)..." % (data_name,)

    A, X, Y = data_fn()

    n_nodes = len(A)

    indices = np.arange(n_nodes)
    np.random.shuffle(indices)

    print indices

    train_indices = indices[: n_nodes // 3]
    valid_indices = indices[n_nodes // 3 : (2 * n_nodes) // 3]
    test_indices = indices[(2 * n_nodes) // 3 :]
    # train_indices = indices[:int(n_nodes*0.8)]
    # valid_indices = indices[int(n_nodes*0.8):int(n_nodes*0.9)]
    # test_indices = indices[int(n_nodes*0.9):]

    model.fit_with_validation(Y, train_indices, valid_indices, test_indices)

    preds = model.predict(Y, np.asarray([]), test_indices)
    actuals = Y[test_indices, :]

    accuracy = accuracy_score(actuals, preds)
    f1_micro = f1_score(actuals, preds, average="micro")
    f1_macro = f1_score(actuals, preds, average="macro")

    print "form: name,micro_f,macro_f,accuracy"
    print "###RESULTS###: %s,%s,%.8f,%.8f,%.8f" % (data_name, model_name, f1_micro, f1_macro, accuracy)
Example #16
def single_test(feature, attribute):
    from sklearn.metrics import f1_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import accuracy_score
    from data_generator import load_vector_from_text
    import random
    data=merge_different_vectors([feature],attribute)
    none_attribute_uids=load_vector_from_text('uids_none_attributes.vector',feature,'list')
    none_attribute_uids=filter(lambda x:x in data[0],none_attribute_uids)
    alpha=0.2*len(data[0])/len(none_attribute_uids)
    train_data=[[],[]]
    test_data=[[],[]]
    for index,uid in enumerate(data[0]):
        if uid in none_attribute_uids and random.random()<alpha:
        #if random.random()<0.2:
            test_data[0].append(data[1][index])
            test_data[1].append(data[2][index])
        else:
            train_data[0].append(data[1][index])
            train_data[1].append(data[2][index])
    print len(test_data[1]),sum(test_data[1]),len(train_data[1]),sum(train_data[1])
    clf=LogisticRegression()
    clf.fit(train_data[0], train_data[1])
    predicted_y=clf.predict(test_data[0])
    test_accuracy=accuracy_score(test_data[1],predicted_y)
    test_recall=recall_score(test_data[1],predicted_y)
    test_f1=f1_score(test_data[1],predicted_y)
    print 'F1 of test data (%d %d): %0.2f'%(sum(test_data[1]),len(test_data[1])-sum(test_data[1]),test_f1)
    print 'Accuracy of test data (%d %d): %0.2f'%(sum(test_data[1]),len(test_data[1])-sum(test_data[1]),test_accuracy)
    predicted_y=clf.predict(train_data[0])
    train_accuracy=accuracy_score(train_data[1],predicted_y)
    train_recall=recall_score(train_data[1],predicted_y)
    train_f1=f1_score(train_data[1],predicted_y)
    print 'F1 of train data (%d %d): %0.2f'%(sum(train_data[1]),len(train_data[1])-sum(train_data[1]),train_f1)
    return [test_accuracy,test_recall,test_f1,train_accuracy,train_recall,train_f1]
Example #17
def benchmark(clf_current):
    print('_' * 80)
    print("Test performance for: ")
    clf_descr = str(clf_current).split('(')[0]
    print(clf_descr)
    t0 = time()
    classif = OneVsRestClassifier(clf_current)
    classif.fit(X_train, Y_train.toarray())
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    if hasattr(clf_current,"decision_function"):
        dfmatrix = classif.decision_function(X_test)
        score = metrics.f1_score(Y_test.toarray(), df_to_preds(dfmatrix, k = 5))
    else:
        probsmatrix = classif.predict_proba(X_test)
        score = metrics.f1_score(Y_test.toarray(), probs_to_preds(probsmatrix, k = 5))
        
    test_time = time() - t0

    
    print("f1-score:   %0.7f" % score)
    print("test time:  %0.3fs" % test_time)

    print('_' * 80)
    return clf_descr, score, train_time, test_time
Example #18
    def findBestDistance(self):
        print '*** start ****'
        d = 0.1
#         y_true = []
#         y_pred = []
        result = {}
        for x in range(0,10):
            y_true = []
            y_pred = []
            for dataIndex in range(0, len(self.lstTest)):
                dataTest = self.lstTest[dataIndex]
#                 y_true.append(dataTest[0])
                y_true.append(self.scoreConvert(dataTest[0]))
                isFilter = self.computeWithNoCorpus(dataTest[1], self.lstTrain, d)
                y_pred.append(self.scoreConvert(isFilter))
            print y_true
            print y_pred
            f1 = metrics.f1_score(y_true, y_pred)
            f1_mac = f1_score(y_true, y_pred, average='macro') 
            print 'd : ',d,' f1 : ',f1,' f1 mac : ',f1_mac
            result[d] = f1
            print classification_report(y_true, y_pred)
#             print 'result ', result
            d = d+0.1
        print result
        print '*** end ******'
Example #19
def ternary_metrics(polarities, lexicon, eval_words, tau_lexicon=None):
    if not tau_lexicon == None:
        kendall_words = list(set(eval_words).intersection(tau_lexicon))
    y_prob, y_true = [], []
    polarities = {word:polarities[word] for word in eval_words}
    for w in polarities:
        y_prob.append(polarities[w])
        y_true.append(lexicon[w])
    y_prob = np.array(y_prob)
    y_true = np.array(y_true)
    y_prob = 2*(y_prob - np.min(y_prob)) / (np.max(y_prob) - np.min(y_prob)) - 1
    neg_prop = np.sum(np.array(lexicon.values()) == -1) / float(len(lexicon))
    pos_prop = np.sum(np.array(lexicon.values()) == 1) / float(len(lexicon))
    sorted_probs = sorted(y_prob)
    neg_thresh = sorted_probs[int(np.round(neg_prop*len(sorted_probs)))]
    pos_thresh = sorted_probs[-int(np.round(pos_prop*len(sorted_probs)))]
    cmn_labels = [1 if val >= pos_thresh else -1 if val <= neg_thresh else 0 for val in y_prob]
    if not tau_lexicon == None:
        tau = kendalltau(*zip(*[(polarities[word], tau_lexicon[word]) for word in kendall_words]))[0]
    else:
        tau = None
    maj_f1 = f1_score(y_true, np.repeat(sp.stats.mode(y_true)[0][0], len(y_true)), average="macro")
    cmn_f1 = f1_score(y_true, cmn_labels, average="macro")
    label_func = lambda entry : 1 if entry > pos_thresh else -1 if entry < neg_thresh else 0
    conf_mat = confusion_matrix(y_true, [label_func(entry) for entry in y_prob])
    return tau, cmn_f1, maj_f1, conf_mat
Example #20
def compareModels(model):
    """
    This evaluates the pre-trained model against
    metamind's API on sentences in `data/validation`

    Parameters
    ----------
    model: test.MODEL
        Namedtuple containing model parameters (dictionary, tfidf
        learner and labels)

    """
    
    set_api_key("MohJ53r6kUvoPjHS8tStX1vnfssvN5EDetVcp2uCNISwXus2BS")

    with open('data/validation', 'r') as fin:
        validations = fin.read()
        truth = [model.labels.label2class[i] for i in
                 ['positive']*9 + ['negative']*8]          

    scores_mm = []
    scores_joe = []
    for validation in validations.split('\n'):
        mmLabel = testMetaMind(validation)[0]['label']
        scores_mm.append(model.labels.label2class[mmLabel])
        joeLabel = testDeepModel(validation, model)
        scores_joe.append(model.labels.label2class[joeLabel])
        
    print 'MetaMind F1 score is %s' % f1_score(truth, scores_mm)
    print 'My F1 score is %s' % f1_score(truth, scores_joe)
Example #21
def test_standard_svm_blobs_2d_class_weight():
    # no edges, reduce to crammer-singer svm
    X, Y = make_blobs(n_samples=210, centers=3, random_state=1, cluster_std=3,
                      shuffle=False)
    X = np.hstack([X, np.ones((X.shape[0], 1))])
    X, Y = X[:170], Y[:170]

    X_graphs = [(x[np.newaxis, :], np.empty((0, 2), dtype=np.int)) for x in X]

    pbl = GraphCRF(n_features=3, n_states=3, inference_method='unary')
    svm = OneSlackSSVM(pbl, check_constraints=False, C=1000)

    svm.fit(X_graphs, Y[:, np.newaxis])

    weights = 1. / np.bincount(Y)
    weights *= len(weights) / np.sum(weights)

    pbl_class_weight = GraphCRF(n_features=3, n_states=3, class_weight=weights,
                                inference_method='unary')
    svm_class_weight = OneSlackSSVM(pbl_class_weight, C=10,
                                    check_constraints=False,
                                    break_on_bad=False)
    svm_class_weight.fit(X_graphs, Y[:, np.newaxis])

    assert_greater(f1_score(Y, np.hstack(svm_class_weight.predict(X_graphs))),
                   f1_score(Y, np.hstack(svm.predict(X_graphs))))
Example #22
 def cutoff_f1(clf, X, y):
     y_pred = (clf.predict_proba(X)[:,1] > cutoff_value).astype(int)
     y_pred2 = clf.predict(X)
     s1 = f1_score(y, y_pred)
     s2 = f1_score(y, y_pred2)
     # print 'f1 = %.4f, %.4f' % (s1, s2)
     return s1
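A related sketch (standalone, made-up scores): rather than a fixed cutoff_value, the probability threshold that maximizes F1 can be read off the precision-recall curve.

import numpy as np
from sklearn.metrics import precision_recall_curve

y = np.array([0, 0, 1, 1, 1, 0, 1, 0])
proba = np.array([0.10, 0.40, 0.35, 0.80, 0.70, 0.20, 0.65, 0.50])
precision, recall, thresholds = precision_recall_curve(y, proba)
f1 = 2 * precision * recall / (precision + recall + 1e-12)
best_cutoff = thresholds[np.argmax(f1[:-1])]  # last PR point has no threshold
print(best_cutoff)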
Example #23
 def on_epoch_end(self, batch, logs={}):
     # losses
     self.losses_train.append(self.model.evaluate(X_train, Y_train, batch_size=128,verbose =0))
     self.losses_val.append(self.model.evaluate(X_val, Y_val, batch_size=128,verbose = 0))
     
     # Roc train
     train_preds = self.model.predict_proba(X_train, verbose=0)
     train_preds = train_preds[:, 1]
     roc_train = metrics.roc_auc_score(y_train, train_preds)
     self.roc_train.append(roc_train)
     
     # Roc val
     val_preds = self.model.predict_proba(X_val, verbose=0)
     val_preds = val_preds[:, 1]
     roc_val = metrics.roc_auc_score(y_val, val_preds)
     self.roc_val.append(roc_val)
     
     # Metrics train
     y_preds = self.model.predict_classes(X_train,verbose = 0)
     self.f1_train.append(metrics.f1_score(y_train,y_preds))
     self.recal_train.append(metrics.recall_score(y_train,y_preds))
     self.preci_train.append(metrics.precision_score(y_train,y_preds))
     
     # Metrics val
     y_preds = self.model.predict_classes(X_val,verbose =0)
     self.f1_val.append(metrics.f1_score(y_val,y_preds))
     self.recal_val.append(metrics.recall_score(y_val,y_preds))
     self.preci_val.append(metrics.precision_score(y_val,y_preds))
Example #24
def cv_model():
    DATA_FILE  = './data/train-set-ru-b64-utf-8.txt'
    all_data = []
    target = []
    with open(DATA_FILE) as df:
        for i, line in enumerate(df):
            print i
            line = line.strip()
            parts = line.split()
            stats_collector = StatsCollector()
            #print parts[2]
            #print base64.b64decode(parts[3])#.decode('utf-8')
            #print parts[2].decode('utf-8'), parts[3].decode('utf-8'), "\n"
            stats_collector.collect(int(parts[1]), parts[3], parts[2])
            # mark page url
            all_data.append(stats_collector.get_features())
            target.append(stats_collector.get_target())
            #print all_data[-1]

    data = np.asarray(all_data, dtype = np.float)
    target = np.asarray(target, dtype = np.float)

    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.05, n_estimators=400,\
     min_samples_split=30, min_samples_leaf=15, max_depth=5)

    kf = KFold(data.shape[0], n_folds = 3, shuffle = True)

    for train_index, test_index in kf:
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print f1_score(y_test, y_pred)
Example #25
def cross_val(data_x, data_y, classifier, kFold, b_cost=1, h_cost=1, w=0.5):
    e_h, e_b = 0, 0
    y_tests, pred_probas = [], []
    
    for train_index, test_index in kFold:
        data_x_, data_y_ = np.array(data_x), np.array(data_y)
        X_train, X_test = list(data_x_[train_index]), list(data_x_[test_index])
        y_train, y_test = list(data_y_[train_index]), list(data_y_[test_index])
        classifier.fit(X_train, y_train)
        pred_proba = [r[0] for r in classifier.predict_proba(X_test)]
        y_tests += y_test
        pred_probas += pred_proba
    
    predictions = [0 if p*b_cost > (1-p)*h_cost else 1 for p in pred_probas]
    roc_auc = roc_auc_score(y_tests, pred_probas)
    total_acc = accuracy_score(y_tests, predictions)
    precision, recall, thresholds = precision_recall_curve(y_tests, pred_probas, pos_label=0)
    fpr, tpr, thresholds = roc_curve(y_tests, pred_probas, pos_label=0)
    precision_bots = precision_score(y_tests, predictions, pos_label = 0)
    precision_humans = precision_score(y_tests, predictions, pos_label = 1)
    recall_bots = recall_score(y_tests, predictions, pos_label = 0)
    recall_humans = recall_score(y_tests, predictions, pos_label = 1)
    f1_bots = f1_score(y_tests, predictions, pos_label = 0)
    f1_humans = f1_score(y_tests, predictions, pos_label = 1)
    conf_matrix = np.matrix(list(confusion_matrix(y_tests, predictions)))
    
    #plot_curve(fpr, tpr, 'ROC', w)
    #plot_curve(recall, precision, 'PR', w)
    
    return [total_acc, precision_bots, precision_humans, recall_bots, recall_humans, f1_bots, f1_humans, roc_auc, conf_matrix]
Example #26
    def logistic_regression_sklearn(self, features, labels):
        """Run a logistic regression, evaluate it, return the LR object
        """

        print '\n**** Running logistic regression...'

        # Split into train / test segments
        features_train, features_test, target_train, target_test = cross_validation.train_test_split(features, labels, test_size=0.20, random_state=0)

        lr = LogisticRegression()
        lr.fit(features_train, target_train)

        # Evaluate the regression
        target_predicted = lr.predict(features_test)
        accuracy = accuracy_score(target_test, target_predicted)

        print 'Logistic regression accuracy score: {0:.0f}%'.format(100 * accuracy)

        # coefs = pd.DataFrame(zip(feature_cols, np.transpose(lr.coef_[0])), columns=['Feature', 'Coefficient'])

        print 'F1: ',
        print f1_score(target_test, target_predicted)

        # preds = lr.predict_proba(features_test)[:,1]
        # fpr, tpr, _ = roc_curve(target_test, preds)

        # print 'AOC: ',
        # print '{:.2f}'.format(auc(fpr,tpr))

        return lr
Example #27
def fit_model():
    DATA_FILE  = './data/train-set-ru-b64-utf-8.txt'
    stats_collector = StatsCollector()
    i=0
    data = []
    target = []

    with open (DATA_FILE) as df:
         for i, line in enumerate(df):
            print i
            line = line.strip()
            parts = line.split()
            stats_collector = StatsCollector()
            stats_collector.collect(int(parts[1]), parts[3], parts[2])
            data.append(stats_collector.get_features())
            target.append(stats_collector.get_target())
            #print len(data[-1])


    data = np.asarray(data, dtype = np.float)
    target = np.asarray(target, dtype = np.float)
    print data.shape, target.shape
    df.close()
    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.07, n_estimators=300, min_samples_split=30,\
         min_samples_leaf=15, max_depth=4)

    clf.fit(data, target)
    y_pred = clf.predict(data)
    print f1_score(target, y_pred)

    joblib.dump(clf, 'model/model.pkl') 
Example #28
    def on_epoch_end(self, epoch, logs={}):
        print logs

        corr=0
        tot=0
        preds = self.model.predict(self.dev_data, verbose=1)

        preds_text=[]
        for l in preds:
            preds_text.append(self.index2label[np.argmax(l)])

        print "Micro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"micro")
        print "Macro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"macro")
        print classification_report(self.dev_labels_text, preds_text)

        for i in xrange(len(self.dev_labels)):

        #    next_index = sample(preds[i])
            next_index = np.argmax(preds[i])
            # print preds[i],next_index,index2label[next_index]

            l = self.index2label[next_index]

            # print "correct:", index2label[np.argmax(dev_labels[i])], "predicted:",l
            if self.index2label[np.argmax(self.dev_labels[i])]==l:
                corr+=1
            tot+=1
        print corr,"/",tot
Example #29
def evaluate_fold(clf, X_train, y_train, X_test, y_test):
    """
    This is the business section
    """
    tmp = dict()
    tmp['X_train.shape'] = X_train.shape
    tmp['X_test.shape'] = X_test.shape
    try:
        pred_test = clf.predict_proba(X_test)
        pred_train = clf.predict_proba(X_train)
        tmp['roc'] = roc_info(y_test, pred_test[:,1])   
        tmp['roc_area'] = roc_auc_score(y_test, pred_test[:,1])
        pred_test = clf.predict(X_test)
        pred_train = clf.predict(X_train)
        tmp['f1_test'] = f1_score(y_test, pred_test, pos_label=1)        
        tmp['f1_train'] = f1_score(y_train, pred_train, pos_label=1) 

    except (AttributeError, NotImplementedError):
        pred_test = clf.predict(X_test)
        pred_train = clf.predict(X_train)
        tmp['roc'] = roc_info(y_test, pred_test)
        tmp['roc_area'] = roc_auc_score(y_test, pred_test)
        tmp['f1_test'] = f1_score(y_test, pred_test, pos_label=1)        
        tmp['f1_train'] = f1_score(y_train, pred_train, pos_label=1) 

    return tmp
Example #30
def scnn_proportion_experiment(data_fn, name, n_hops, prop_valid, prop_test, transform_fn=util.rw_laplacian, transform_name='rwl'):
    print 'Running node experiment (%s)...' % (name,)

    A, X, Y = data_fn()

    n_nodes = A.shape[0]

    indices = np.arange(n_nodes)

    valid_start = int(n_nodes * (1 - (prop_valid + prop_test)))
    test_start = int(n_nodes * (1 - prop_test))

    valid_indices = indices[valid_start:test_start]
    test_indices  = indices[test_start:]

    for train_prop in [x / 10.0 for x in range(1, 11)]:
        train_end = int(valid_start * train_prop)
        train_indices = indices[:train_end]

        scnn = SCNN(n_hops=n_hops, transform_fn=transform_fn)
        scnn.fit(A, X, Y, train_indices=train_indices, valid_indices=valid_indices)

        probs = scnn.predict_proba(X, test_indices)
        print probs

        preds = scnn.predict(X, test_indices)
        actuals = np.argmax(Y[test_indices,:], axis=1)

        f1_micro = f1_score(actuals, preds, average='micro')
        f1_macro = f1_score(actuals, preds, average='macro')
        accuracy = accuracy_score(actuals, preds)

        print 'form: name,n_hops,transform_name,micro_f,macro_f,accuracy'
        print '###RESULTS###: %s,%d,%.2f,%s,%.8f,%.8f,%.8f' % (name, n_hops, train_prop, transform_name, f1_micro, f1_macro, accuracy)
Example #31
    train, _ = ColumnInfoExtractor(n_files=num_files,
                                   n_rows=num_rows,
                                   train_size=1.,
                                   n_jobs=n_cores,
                                   column_sample=True).transform(
                                       annotations_file=train_file_path,
                                       csv_folder=csv_folder_path)

    test, _ = ColumnInfoExtractor(n_files=num_files,
                                  n_rows=num_rows,
                                  train_size=1.,
                                  n_jobs=n_cores,
                                  column_sample=True).transform(
                                      annotations_file=test_file_path,
                                      csv_folder=csv_folder_path)

    tqdm.write("Loading data done...")
    ablation_results = defaultdict(dict)
    for pp_name, pp in tqdm(all_pipelines.items()):
        tqdm.write(f"Fitting pipeline {pp_name}")
        pp.fit(train, train["y"])
        y_test = test["y"]
        y_pred = pp.predict(test)
        f_score = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
        ablation_results[pp_name]["f_score"] = f_score
        ablation_results[pp_name]["confusion_matrix"] = confusion_matrix(
            y_true=y_test, y_pred=y_pred).tolist()
    ablation_results["tags"] = list(np.unique(test["y"]))
    json.dump(ablation_results, open("./data/ablation_results.json", "w"))
Example #32
from sklearn.metrics import f1_score

y_true = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [0, 1, 1, 1, 1, 0, 0, 0, 1, 1]

print(f1_score(y_true, y_pred))
# 0.3636363636363636
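A worked check of the printed value (pos_label=1): these predictions contain TP=2, FP=4 and FN=3, so precision = 2/6, recall = 2/5 and F1 = 2PR/(P+R) = 4/11 ≈ 0.3636.

precision = 2 / 6
recall = 2 / 5
print(2 * precision * recall / (precision + recall))  # 0.36363636...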
Example #33
File: run.py Project: jsusu/Emr_Ner
def train1():
    with open(opt.pickle_train_path, 'rb') as inp:
        word2id = pickle.load(inp)
        id2word = pickle.load(inp)
        tag2id = pickle.load(inp)
        id2tag = pickle.load(inp)
        x_train = pickle.load(inp)
        y_train = pickle.load(inp)
        x_valid = pickle.load(inp)
        y_valid = pickle.load(inp)

    print("train len:", len(x_train))
    print("valid len", len(x_valid))
    # print("test len", len(x_test))
    train_dataset = NERDataset(x_train,y_train)
    valid_dataset = NERDataset(x_valid, y_valid)
    # valid_dataset = NERDataset(x_valid, y_valid)
    # test_dataset = NERDataset(x_test, y_test)

    train_dataloader = DataLoader(train_dataset, batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers)
    valid_dataloader = DataLoader(valid_dataset, batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers)
    # test_dataloader = DataLoader(test_dataset, batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers)
    # x = train_dataset[0]
    # print(x)
    # for index, batch in enumerate(train_dataloader):
    #     print(index)
    #     print(batch)
    models = {'NERLSTM': NERLSTM,
              'NERLSTM_CRF': NERLSTM_CRF}
    all_vec = load_vec(opt.load_vec_path)
    # device = torch.device('cuda')
    # model = models[opt.model](opt.embedding_dim, opt.hidden_dim, opt.dropout, word2id, tag2id).cuda()
    model = models[opt.model](opt.word_dim, opt.embedding_dim, opt.hidden_dim, opt.filter_size, opt.cnn_out_dim,
                              opt.dropout, word2id, tag2id, all_vec).cuda()

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)

    if opt.model == 'NERLSTM':
        for epoch in range(opt.max_epoch):
            model.train()
            for index, batch in enumerate(train_dataloader):
                optimizer.zero_grad()
                X = batch['x'].cuda()
                y = batch['y'].cuda()

                y = y.view(-1, 1)
                y = y.squeeze(-1)
                pred = model(X)
                pred = pred.view(-1, pred.size(-1))
                loss = criterion(pred, y)
                loss.backward()
                optimizer.step()
                if index % 200 == 0:
                    print('epoch:%04d,------------loss:%f' % (epoch, loss.item()))

            aver_loss = 0
            preds, labels = [], []
            for index, batch in enumerate(valid_dataloader):
                model.eval()
                val_x, val_y = batch['x'].cuda(), batch['y'].cuda()
                predict = model(val_x)
                predict = torch.argmax(predict, dim=-1)
                if index % 500 == 0:
                    print([id2word[i.item()] for i in val_x[0].cpu() if i.item() > 0])
                    length = [id2tag[i.item()] for i in val_y[0].cpu() if i.item() > 0]
                    print(length)
                    print([id2tag[i.item()] for i in predict[0][:len(length)].cpu() if i.item() > 0])

                # Count the non-zero entries, i.e. the lengths of the true label sequences
                leng = []
                for i in val_y.cpu():
                    tmp = []
                    for j in i:
                        if j.item() > 0:
                            tmp.append(j.item())
                    leng.append(tmp)

                # Collect predicted labels truncated to the true sequence lengths
                for index, i in enumerate(predict.tolist()):
                    preds.extend(i[:len(leng[index])])

                # Collect true labels truncated to the true sequence lengths
                for index, i in enumerate(val_y.tolist()):
                    labels.extend(i[:len(leng[index])])

            precision = precision_score(labels, preds, average='macro')
            recall = recall_score(labels, preds, average='macro')
            f1 = f1_score(labels, preds, average='macro')
            report = classification_report(labels, preds)
            print(report)
    elif opt.model == 'NERLSTM_CRF':
        best_score = 0.0
        for epoch in range(opt.max_epoch):
            model.train()
            for index, batch in enumerate(train_dataloader):
                optimizer.zero_grad()
                X = batch['x'].cuda()
                y = batch['y'].cuda()
                # CRF
                loss = model.log_likelihood(X, y)
                loss.backward()
                # CRF
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=10)

                optimizer.step()
                if index % 200 == 0:
                    print('best_score:%f' % (best_score))
                    print('epoch:%02d,idnex%4d------------loss:%f' % (epoch, index, loss.item()))

            aver_loss = 0
            preds, labels = [], []
            for index, batch in enumerate(valid_dataloader):
                model.eval()
                val_x, val_y = batch['x'].cuda(), batch['y'].cuda()
                predict = model(val_x)
                # CRF
                loss = model.log_likelihood(val_x, val_y)
                aver_loss += loss.item()
                # Count the non-zero entries, i.e. the lengths of the true label sequences
                leng = []
                for i in val_y.cpu():
                    tmp = []
                    for j in i:
                        if j.item() > 0:
                            tmp.append(j.item())
                    leng.append(tmp)

                for index, i in enumerate(predict):
                    preds += i[:len(leng[index])]

                for index, i in enumerate(val_y.tolist()):
                    labels += i[:len(leng[index])]
            aver_loss /= (len(valid_dataloader) * 64)
            precision = precision_score(labels, preds, average='macro')
            recall = recall_score(labels, preds, average='macro')
            f1 = f1_score(labels, preds, average='macro')
            # report = classification_report(labels, preds)
            # print(report)
            print('p', precision)
            print('r', recall)
            print('f1', f1)
            if f1 > best_score:
                best_score = f1
                path_name = './model/model' + str(epoch) + '----' + str(f1) + '.pkl'
                torch.save(model, path_name)
                print('model has been saved')
Example #34
File: run.py Project: jsusu/Emr_Ner
def test1(model_path,output_file,output_file1):
    def list2tags(l_list):
        r = []
        for l in l_list:
            r.append(id2tag[l])
        return r
    with open(opt.pickle_train_path, 'rb') as inp:
        word2id = pickle.load(inp)
        id2word = pickle.load(inp)
        tag2id = pickle.load(inp)
        id2tag = pickle.load(inp)
        x_train = pickle.load(inp)
        y_train = pickle.load(inp)
        x_test = pickle.load(inp)
        y_test = pickle.load(inp)

    print("valid len", len(x_test))

    test_dataset = NERDataset(x_test, y_test)

    test_dataloader = DataLoader(test_dataset, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers)

    model = torch.load(model_path)
    model.eval()

    aver_loss = 0
    preds, labels = [], []
    for index, batch in enumerate(test_dataloader):
        model.eval()
        val_x, val_y = batch['x'].cuda(), batch['y'].cuda()
        predict = model(val_x)
        # CRF
        loss = model.log_likelihood(val_x, val_y)
        aver_loss += loss.item()
        # Count the non-zero entries, i.e. the lengths of the true label sequences
        leng = []
        for i in val_y.cpu():
            tmp = []
            for j in i:
                if j.item() > 0:
                    tmp.append(j.item())
            leng.append(tmp)

        for index, i in enumerate(predict):
            preds += i[:len(leng[index])]

        for index, i in enumerate(val_y.tolist()):
            labels += i[:len(leng[index])]

    print('prediction\n' + str(len(list2tags(preds))) + '\n', list2tags(preds))
    print('labels\n'+ str(len(list2tags(labels))) + '\n', list2tags(labels))
    aver_loss /= (len(test_dataloader) * 64)
    # precision = precision_score(labels, preds, average='macro')
    # recall = recall_score(labels, preds, average='macro')
    # f1 = f1_score(labels, preds, average='macro')
    precision = precision_score(labels, preds, average='macro')
    recall = recall_score(labels, preds, average='macro')
    f1 = f1_score(labels, preds, average='macro')
    # report = classification_report(labels, preds)
    # print(report)
    print('p',precision)
    print('r',recall)
    print('f1',f1)
    p, r, f = get_f1score(list2tags(preds), list2tags(labels))
    print('p', p)
    print('r', r)
    print('f1', f)
Example #35
    skf = StratifiedKFold(n_splits=5)
    confusion_matrices = []
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    for train, test in skf.split(X, Y):
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = Y[train], Y[test]
        clf.fit(xtrain, ytrain)
        ypredict = clf.predict(xtest)
        confusion_matrices.append(confusion_matrix(ytest, ypredict))
        accuracies.append(accuracy_score(ytest, ypredict))
        precisions.append(precision_score(ytest, ypredict))
        recalls.append(recall_score(ytest, ypredict))
        f1s.append(f1_score(ytest, ypredict))

    print '5-fold cross-validation'
    print 'sum of confusion matrices'
    print sum(confusion_matrices)
    print 'average accuracy'
    print np.mean(accuracies)
    print 'average precision'
    print np.mean(precisions)
    print 'average recall'
    print np.mean(recalls)
    print 'average f1'
    print np.mean(f1s)

    # train a classifier on the full training set
    final_classifier = naive_bayes.BernoulliNB()
Example #36
def get_metrics(path):
    precision = []
    recall = []
    fscore = []
    roc_auc = []
    results = []
    valid_dataset = 67
    test_scores = np.load(path)
    threshold = 0
    for i in range(67):
        values = np.load("data/A1X_" + str(i + 1) + ".npy")
        labels = np.load("data/A1Y_" + str(i + 1) + ".npy")
        test_portion = 0.2
        test_n = int(len(labels) * test_portion)
        train_values, test_values = values[:-test_n], values[-test_n:]
        train_labels, test_labels = labels[:-test_n], labels[-test_n:]
        test_score = test_scores[i]
        threshold = np.sum(test_score) / test_score.shape[0]
        test_correct = np.zeros(len(test_score))
        for j in range(len(test_labels) - window_size + 1):
            for k in range(window_size):
                if (test_labels[j + k] == 1):
                    test_correct[j] = 1
                    break

        # This is used for "threshold" detector (Detector 1 in report)
        predictions = (test_score < threshold).astype(np.int32)

        # Use the anomalous_num if taking advanced detection method (Detector 2 in report)
        #anomalous_num = np.where(test_correct==1)[0].shape[0]
        #predictions = np.zeros_like(test_correct)
        #predictions_idx = heapq.nsmallest(anomalous_num, range(len(test_score)), test_score.take)
        #predictions[predictions_idx] = 1

        precision.append(
            precision_score(test_correct, predictions, average="binary"))
        recall.append(recall_score(test_correct, predictions,
                                   average="binary"))
        fscore.append(f1_score(test_correct, predictions, average="binary"))
        if (np.sum(test_correct == 1) == 0
                or np.sum(test_correct == 1) == test_correct.shape[0]):
            roc_auc.append(0)
            valid_dataset -= 1
        else:
            roc_auc.append(roc_auc_score(test_correct, test_score))

        # This part is for augmenting the dataset with infrequent normal samples
        """
        train_score = np.load("scores_on_trained_S.npy")
        train_correct = np.zeros(len(train_score[i]))
        for j in range(len(train_labels)-window_size+1):
            for k in range(window_size):
                if (train_labels[j+k] == 1):
                    train_correct[j] = 1
                    break
        a_n = np.where(train_correct==1)[0].shape[0]
        pred = np.zeros_like(train_correct)
        pred_idx = heapq.nsmallest(a_n, range(len(train_score[i])), train_score[i].take)
        pred[pred_idx] = 1
        augment_data(train_correct, train_values[:-119], pred, 50, "aug_data/A1X_"+str(i+1)+".npy", "aug_data/A1Y_"+str(i+1)+".npy")
        """
    precision = np.array(precision)
    recall = np.array(recall)
    fscore = np.array(fscore)
    roc_auc = np.array(roc_auc)
    #np.save("precision.npy", precision)
    #np.save("recall.npy", recall)
    #np.save("fscore.npy", fscore)
    #np.save("roc_auc.npy", roc_auc)
    print("Precision:", float(np.sum(precision)) / valid_dataset)
    print("Recall:", float(np.sum(recall)) / valid_dataset)
    print("Fscore:", float(np.sum(fscore)) / valid_dataset)
    print("AUC_Score:", float(np.sum(roc_auc)) / valid_dataset)
Example #37
models_report = pd.DataFrame(columns=[
    'Model', 'Precision_score', 'Recall_score', 'F1_score', 'Accuracy'
])

for clf, clf_name in zip(clfs.values(), clfs.keys()):
    clf.fit(X_train_up, y_train_up)
    y_pred = clf.predict(X_test_up)
    y_score = clf.score(X_test_up, y_test_up)

    #print('Calculating {}'.format(clf_name))
    t = pd.Series({
        'Model': clf_name,
        'Precision_score': metrics.precision_score(y_test_up, y_pred),
        'Recall_score': metrics.recall_score(y_test_up, y_pred),
        'F1_score': metrics.f1_score(y_test_up, y_pred),
        'Accuracy': metrics.accuracy_score(y_test_up, y_pred)
    })

    models_report = models_report.append(t, ignore_index=True)

models_report

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=300,
                                    n_jobs=1,
                                    random_state=0,
                                    bootstrap=False)
classifier.fit(X_train_up, y_train_up)

y_pred = classifier.predict(X_test)
Example #38
	auc_scores = []

	for train_index, test_index in kf.split(idx_train):
		X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
		y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

		elm.fit(X_train_fold,y_train_fold)

		y_pred       = elm.predict_proba(X_test_fold)[:, 1]
		yhat_classes = y_pred.copy()
		yhat_classes[yhat_classes>=threshold] = np.float64(1)
		yhat_classes[yhat_classes<threshold]  = np.float64(0)

		accuracy  = accuracy_score(y_test_fold, yhat_classes)
		loss      = log_loss(y_test_fold, yhat_classes)
		f1        = f1_score(y_test_fold, yhat_classes)
		precision = precision_score(y_test_fold, yhat_classes)
		recall    = recall_score(y_test_fold, yhat_classes)
		auc_score = roc_auc_score(y_test_fold, y_pred)

		accuracies.append(accuracy)
		losses.append(loss)
		f1s.append(f1)
		precisions.append(precision)
		recalls.append(recall)
		auc_scores.append(auc_score)

	end   = time.time()
	print('Accuracy: %f' % np.array(accuracies).mean())
	print('Precision: %f' % np.array(precisions).mean())
	print('Recall: %f' % np.array(recalls).mean())
Example #39
def CNN_model(X_training,
              X_test,
              y_training,
              y_test,
              n_epochs=100,
              batch_size=256,
              model_name='model',
              history_file='model_accuracies.csv',
              conf_matrix=False,
              accuracy_report=False):

    while os.path.isfile(model_name + ".h5"):
        model_name = model_name + str(1)

    csv_logger = CSVLogger('model_training.log')
    plot_losses = my_callbacks.PlotLosses()
    metrics = my_callbacks.Metrics()
    f1_accuracy = my_callbacks.F1Metric()
    earlystop = EarlyStopping(monitor='val_acc', patience=10, mode='auto')
    adam = Adam(lr=0.00001,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=None,
                decay=0.0,
                amsgrad=False)

    model = Sequential()
    model.add(
        Conv1D(32,
               9,
               input_shape=(X_training.shape[1], 1),
               kernel_initializer=he_normal(seed=12),
               activation='relu',
               W_regularizer=l1_l2(0.01)))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(1))
    model.add(
        Conv1D(32,
               3,
               activation='relu',
               W_regularizer=l1_l2(0.01),
               padding='same'))
    model.add(MaxPooling1D(3, padding='same'))
    model.add(BatchNormalization())
    model.add(
        Conv1D(9,
               3,
               activation='relu',
               W_regularizer=l1_l2(0.01),
               padding='same'))
    model.add(MaxPooling1D(3, padding='same'))
    model.add(BatchNormalization())
    model.add(
        Conv1D(9,
               3,
               activation='relu',
               W_regularizer=l1_l2(0.01),
               padding='same'))
    model.add(MaxPooling1D(3, padding='same'))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(17, activation='softmax', input_shape=(1, )))
    model.compile(optimizer=adam,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    print('starts fitting model ...')
    start = time.time()
    model.fit(X_training,
              y_training,
              batch_size=batch_size,
              epochs=n_epochs,
              validation_data=(X_test, y_test),
              callbacks=[metrics, csv_logger])
    end = time.time()
    delta = end - start
    print('fitting time: ', delta)

    print('starts predicting model ...')
    start_prediction = time.time()
    model.predict(X_test)
    end_prediction = time.time()
    delta_prediction = end_prediction - start_prediction
    print('prediction time: ', delta_prediction)

    y_pred = model.predict_classes(X_test)

    model.save_weights(model_name + ".h5")
    print('weights saved to disk')

    model_json = model.to_json()
    with open(model_name + '.json', 'w') as json_file:
        json_file.write(model_json)
    print('model saved to disk')

    with open(history_file, 'a', newline='') as history:
        writer = csv.writer(history, delimiter=';')
        writer.writerow([
            model_name,
            accuracy_score(y_test, y_pred),
            cohen_kappa_score(y_test, y_pred),
            f1_score(y_test, y_pred, average='weighted'), delta,
            delta_prediction
        ])

    if conf_matrix:
        cm_filename = model_name + '_cm.csv'
        cm = pd.DataFrame(confusion_matrix(y_test, y_pred))
        cm.to_csv(cm_filename)

    if accuracy_report:
        raport_filename = model_name + '_report.csv'
        report = classification_report(y_test, y_pred)
        with open(raport_filename, 'w') as acc_report:
            acc_report.write(report)

    return y_pred
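The history row above logs a weighted F1. For contrast, a standalone toy example (made-up imbalanced labels) of how macro and weighted averaging can disagree:

import numpy as np
from sklearn.metrics import f1_score

y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0, 0, 1, 1, 0]
print(f1_score(y_true, y_pred, average='macro'))     # treats both classes equally
print(f1_score(y_true, y_pred, average='weighted'))  # weights classes by support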
Example #40
            'KNeighborsClassifier'
    ], [pe_v, nb_v, dt_v, rf_v, lr_v, kn_v]):
        print(x)
        error = (Y_test != y).sum()
        p[0].append(error)
        print("Errors    : %d" % error)
        acc = accuracy_score(y, Y_test) * 100
        p[1].append(acc)
        print("Accuracy  : %.2f%%" % acc)
        ps = precision_score(y, Y_test) * 100
        p[2].append(ps)
        print("Precision : %.2f%%" % ps)
        rs = recall_score(y, Y_test) * 100
        p[3].append(rs)
        print("Recall    : %.2f%%" % rs)
        f1 = f1_score(y, Y_test) * 100
        p[4].append(f1)
        print("F1 Score  : %.2f%% \n" % f1)

    print("\n")

print("WITHOUT PCA")
# Perceptron Model
pe = Perceptron(n_iter=10, eta0=10, n_jobs=-1)
pe.fit(X_train_sd, Y_train)

# Naive Bayes Classification
nb = GaussianNB()
nb.fit(X_train, Y_train)

# Decision Tree Classifier
Example #41
def main():
    # Could/should refactor this whole thing
    args = parseCmdLine()

    myModule = sklearnHelperLib.importPyFile(args.pipelineDefs)
    pipelines = myModule.pipelines

    if type(pipelines) != type([]): 
	pipelines = [ pipelines ]

    if args.vote: nPipelinesAndVotes = len(pipelines)+1	# include votes
    else: nPipelinesAndVotes = len(pipelines)

    # totals across all the split tries for each pipeline + voted predictions
    #  for computing averages
    pipelineTotals = [ {'fscores':0,
			'precisions': 0,
			'f1': 0,
			'recalls': 0, } for i in range(nPipelinesAndVotes) ]

    # formats for output lines, Pipeline line, votes line, avg line
    pf="Pipeline %d:   F1: %5.3f   F%d: %5.3f   Precision: %4.2f   Recall: %4.2f"
    vf="Votes... %d:   F1: %5.3f   F%d: %5.3f   Precision: %4.2f   Recall: %4.2f"
    af="Average. %d:   F1: %5.3f   F%d: %5.3f   Precision: %4.2f   Recall: %4.2f"

    dataSet = load_files( args.trainingData )
    labelIndex = dataSet['target_names'].index(args.label)

    for sp in range(args.numSplits):
	docs_train, docs_test, y_train, y_test = \
		train_test_split( dataSet.data, dataSet.target,
				test_size=args.testSize, random_state=None)

	predictions = []	# predictions[i]= predictions for ith Pipeline
				#  on this split (for voting)
	print "Sample Split %d" % sp
	for i, pl in enumerate(pipelines):	# for each Pipeline

	    pl.fit(docs_train, y_train)
	    y_pred = pl.predict(docs_test)
	    predictions.append(y_pred)

	    precision, recall, fscore, support = \
			    precision_recall_fscore_support( \
						y_test, y_pred, args.beta,
						pos_label=labelIndex,
						average='binary')
	    f1 = f1_score(y_test, y_pred, pos_label=labelIndex,
							average='binary')
	    pipelineTotals[i]['fscores']    += fscore
	    pipelineTotals[i]['f1']	    += f1
	    pipelineTotals[i]['precisions'] += precision
	    pipelineTotals[i]['recalls']    += recall

	    l = pf % (i, f1, args.beta, fscore, precision, recall)
	    print l

	if args.vote:
	    vote_pred = y_vote( predictions )
	    precision, recall, fscore, support = \
				precision_recall_fscore_support( \
						y_test, vote_pred, args.beta,
						pos_label=labelIndex,
						average='binary')
	    f1 = f1_score(y_test, vote_pred, pos_label=labelIndex,
							average='binary')
	    i = len(pipelines)
	    pipelineTotals[i]['fscores']    += fscore
	    pipelineTotals[i]['f1']	    += f1
	    pipelineTotals[i]['precisions'] += precision
	    pipelineTotals[i]['recalls']    += recall

	    l = vf % (i , f1, args.beta, fscore, precision, recall)
	    print l
    # averages across all the Splits
    print
    for i in range(nPipelinesAndVotes):
	avgFscore    = pipelineTotals[i]['fscores']    / args.numSplits
	avgF1        = pipelineTotals[i]['f1']         / args.numSplits
	avgPrecision = pipelineTotals[i]['precisions'] / args.numSplits
	avgRecall    = pipelineTotals[i]['recalls']    / args.numSplits
	l = af % (i, avgF1, args.beta, avgFscore, avgPrecision, avgRecall)
	print l

    # pipeline info
    print "\nTraining data: %s" % args.trainingData
    print time.strftime("%Y/%m/%d-%H-%M-%S")
    for i,p in  enumerate(pipelines):
	print "\nPipeline %d -------------" % i
	for s in p.steps:
	    print s
Example #42
model_xgb = XGBClassifier(scale_pos_weight=3,
                          learning_rate=0.2,
                          n_estimators=200,
                          min_child_weight=20,
                          max_depth=3,
                          base_score=0.5,
                          gamma=0,
                          n_jobs=4)
# eval = [(X_test, y_test)]
# model_xgb.fit(X_train, y_train, eval_set=eval, eval_metric='auc', early_stopping_rounds=20, verbose=True)
model_xgb.fit(X, Y)
y_pred = model_xgb.predict(X_test)

p = precision_score(y_test, y_pred, average='binary')
r = recall_score(y_test, y_pred, average='binary')
f1score = f1_score(y_test, y_pred, average='binary')
print(p)
print(r)
print(f1score)

plot_importance(model_xgb, importance_type='gain')
pyplot.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
pyplot.rcParams['axes.unicode_minus'] = False
pyplot.show()

# y_real_pred = model_xgb.predict(X_real_test)
# y_real_pred = grid_search.predict(X_real_test)
# y_real_id = pd.read_csv('data\df_id_test.csv', index_col=0, names=["个人编码", "result"])
# temp = pd.DataFrame({"result": y_real_pred}, index=X_real_test.index)
# print(temp["result"].sum())
# y_real_id["result"] = temp["result"]
	for i in range(0, np.shape(features)[0], predict_batch):
		predictions[i:i+predict_batch] = clf.predict(features[
			i:i+predict_batch])

	predictions_prob = np.zeros((np.shape(features)[0], len(np.unique(label))))
	for i in range(0, np.shape(features)[0], predict_batch):
		predictions_prob[i:i+predict_batch] = clf.predict_proba(features[
			i:i+predict_batch])
	np.save('predictions.npy', predictions)
	np.save('predictions_prob.npy', predictions_prob)

	predictions = predictions.astype(np.uint8)
	print("predictions", predictions.shape, np.unique(predictions), predictions.dtype)
	print("label_test", label.shape, np.unique(label), label.dtype)

	metrics = {}
	metrics['f1_score'] = f1_score(label, predictions, average=None)
	metrics['f1_score_weighted'] = f1_score(label, predictions, average='weighted')
	metrics['overall_acc'] = accuracy_score(label, predictions)
	confusion_matrix_ = confusion_matrix(label, predictions)
	metrics['per_class_acc'] = (confusion_matrix_.astype('float') / confusion_matrix_.sum(axis=1)[:, np.newaxis]).diagonal()
	metrics['average_acc'] = np.average(metrics['per_class_acc'][~np.isnan(metrics['per_class_acc'])])
	print(metrics)
	print(confusion_matrix_)
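
	# Cross-check (added, not in the original): the diagonal of the row-normalized
	# confusion matrix above is the per-class recall, so the same numbers can be
	# obtained directly from sklearn:
	from sklearn.metrics import recall_score
	print("per-class recall:", recall_score(label, predictions, average=None))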

Ejemplo n.º 44
0
    # A prediction is made on the test dataset based on the model fitted on train set
    y_pred = clf.predict(x_test)

    # Various performance metrics were found and reported for each k
    print "k =", k
    """
	Report all performance metrics and append values to their respective arrays
	"""
    accuracies.append(accuracy_score(y_test, y_pred))
    print 'Accuracy:', accuracy_score(y_test, y_pred)

    precisions.append(precision_score(y_test, y_pred))
    print 'Precision:', precision_score(y_test, y_pred)

    recalls.append(recall_score(y_test, y_pred))
    print 'Recall:', recall_score(y_test, y_pred)

    fscores.append(f1_score(y_test, y_pred))
    print 'F1-Score:', f1_score(y_test, y_pred)

# Find best fit k based on their accuracies
best_k_index = np.argmax(accuracies)
"""
Display performance metrics for best fit k value
"""
print "Best fit k =", best_k_index + 3
print 'Best fit Accuracy:', accuracies[best_k_index]
print 'Best fit Precision:', precisions[best_k_index]
print 'Best fit Recall:', recalls[best_k_index]
print 'Best fit F1-Score:', fscores[best_k_index]
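
# Context sketch (an assumption; the enclosing loop is not shown in this snippet):
# the metrics above are computed inside a loop over k that starts at 3, which is
# why the best k is reported as best_k_index + 3. A minimal, self-contained
# version of such a sweep might look like:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

def sweep_knn_k(x_train, y_train, x_test, y_test, k_values=range(3, 21)):
    accs = []
    for k in k_values:
        clf = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
        accs.append(accuracy_score(y_test, clf.predict(x_test)))
    best = int(np.argmax(accs))
    return k_values[best], accs[best]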
Ejemplo n.º 45
0
def gini_samples_f1(truth, predictions):
    return f1_score(truth, predictions.argmax(axis=1), average='samples')
Ejemplo n.º 46
0
    def train(self, data_dir):
        '''
        Trains a single layer model on the data contained in the specified
        directory.  Labels found in the directory are augmented with an
        unknown label.

        Args:
            data_dir: Directory containing the training data
        '''

        print("Reading data")
        # First read the data directory for the features and labels
        X_all, y_all, new_labels = read_data(
                                              data_dir,
                                              duration=self.duration,
                                              labels=self.labels
                                            )
        self.labels = new_labels

        print("Making data splits")
        # Split the data into training, validation, and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
                                                            X_all,
                                                            y_all,
                                                            test_size=0.2,
                                                            random_state=0
                                                           )

        print("Normalizing features")
        # Mean normalize the features, saving the means and variances
        self.means = X_train.mean(axis=0)
        self.stds = X_train.std(axis=0)
        # Clamp standard deviations that are <= 1 (including zeros) to 1
        zero_stds = self.stds <= 1
        self.stds[zero_stds] = 1
        # Apply the mean normalization transformation to the training data
        X_normed = X_train - np.expand_dims(self.means, 0)
        X_normed /= np.expand_dims(self.stds, 0)

        print("Doing feature selection")
        # Select the relevant features from the training set
        self.feature_list = select_features(X_normed, y_train)
        print(self.feature_list)

        # If hidden size wasn't specified, default to the mean of the number
        # of features and the size of the label space
        if self.hidden_size is None:
            self.hidden_size = (len(self.labels) + len(self.feature_list)) // 2

        # Augment the data with randomly permuted samples
        X_aug, y_aug = self._augment_data(X_normed, y_train)

        # Fit the one layer model to the augmented training data
        X_input = X_aug[:, self.feature_list]
        self.model = MLPClassifier(
                                    hidden_layer_sizes=(self.hidden_size,),
                                    alpha=0.1,
                                    activation='relu',
                                    max_iter=1000
                                  )

        self.model.fit(X_input, y_aug)

        # Evaluate the model on the augmented test data
        X_test_input = X_test - np.expand_dims(self.means, 0)
        X_test_input /= np.expand_dims(self.stds, 0)
        X_test_aug, y_test_aug = self._augment_data(X_test_input, y_test)
        predictions = self.model.predict(X_test_aug[:, self.feature_list])
        print("F1 score:",
                f1_score(y_test_aug, predictions, average='weighted'))
Ejemplo n.º 47
0
def gini_weighted_f1(truth, predictions):
    return f1_score(truth, predictions.argmax(axis=1), average='weighted')
Ejemplo n.º 48
0
 def score(self, X, y, print_report=False):
     predictions = self.predict(X)
     if print_report:
         print(confusion_matrix(y, predictions))
         print(classification_report(y, predictions, digits=3))
     return f1_score(y, predictions, average="macro")
Ejemplo n.º 49
0
            string_files = string_files[int(len(string_files) *
                                            args['split']):]
            struct_files = struct_files[int(len(struct_files) *
                                            args['split']):]
            dynamc_files = dynamc_files[int(len(dynamc_files) *
                                            args['split']):]
            print('---------- ENSEMBLE MODEL ----------')
            res = ensemble(
                string_files, struct_files, dynamc_files,
                string_model.predict(map(itemgetter(1), string_files)),
                structure_model.predict(map(itemgetter(1), struct_files)),
                dynamic_model.predict(map(itemgetter(1), dynamc_files)))
            # Evaluate with sklearn metrics, using y_true=res[1] and y_pred=res[2]
            print("accuracy:\t\t\t", metrics.accuracy_score(res[1], res[2]))
            print("f1 score (micro):\t\t",
                  metrics.f1_score(res[1], res[2], average='micro'))
            print("precision score (micro):\t",
                  metrics.precision_score(res[1], res[2], average='micro'))
            print("recall score (micro):\t\t",
                  metrics.recall_score(res[1], res[2], average='micro'))
            print("f1 score (macro):\t\t",
                  metrics.f1_score(res[1], res[2], average='macro'))
            print("precision score (macro):\t",
                  metrics.precision_score(res[1], res[2], average='macro'))
            print("recall score (macro):\t\t",
                  metrics.recall_score(res[1], res[2], average='macro'))

        elif mode == 'validate':
            if args['choose']:
                print(
                    'The parameter "choose" is unavailable for this operation.'
Ejemplo n.º 50
0
def gini_micro_f1(truth, predictions):
    return f1_score(truth, predictions.argmax(axis=1), average='micro')
                      params,
                      n_jobs=1,
                      cv=5,
                      return_train_score=True,
                      scoring={'f1_score': make_scorer(f1_score, average='macro'),
                               'accuracy': 'accuracy'}, 
                      refit='f1_score',   
                      verbose=10,
                      error_score='raise')
best = search.fit(X[:, :, :], Y[:])
print(best.__dict__)
print("BEST PARAMS: ", best.best_params_)

model_dir = 'models'
if not os.path.exists(model_dir):
  os.makedirs(model_dir)

best_estimator = best.best_estimator_
best_estimator.fit(X, Y)
predicted_y = best_estimator.predict(X_test)
print("TEST EVALUATION")
print("F1-SCORE: ", f1_score(Y_test, predicted_y, average='macro'))
print("ACCURACY: ", accuracy_score(Y_test, predicted_y))
auc_roc = roc_auc_score(Y_test, predicted_y)
print("AUROC score : %s "% acc)
precision, recall, _ = precision_recall_curve(Y_test, predicted_y)
auc_prc = auc(recall, precision)
print("AUPRC score : %s "% auc_prc)
print(confusion_matrix(Y_test, predicted_y))
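
# Added note (an assumption about the estimator): roc_auc_score is normally fed
# class scores/probabilities rather than hard 0/1 predictions; if the refitted
# estimator exposes predict_proba, a sketch would be:
if hasattr(best_estimator, "predict_proba"):
    y_score = best_estimator.predict_proba(X_test)[:, 1]
    print("AUROC (from probabilities): %s" % roc_auc_score(Y_test, y_score))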

best_estimator.model.save(os.path.join(model_dir, str(datetime.now().strftime("%Y%m%d-%H%M%S"))))
Ejemplo n.º 52
0
def gini_f1(truth, predictions, pos_label=1):
    return f1_score(truth, predictions, average=None)[pos_label]
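
# Quick illustration (added here; not part of any snippet above) of how the
# averaging modes used by these gini_*_f1 helpers differ on a toy multiclass case:
def _f1_averaging_demo():
    from sklearn.metrics import f1_score
    y_true = [0, 0, 0, 1, 2]
    y_pred = [0, 0, 1, 1, 1]
    print("per-class:", f1_score(y_true, y_pred, average=None))        # [0.8, 0.5, 0.0]
    print("macro:    ", f1_score(y_true, y_pred, average='macro'))     # unweighted mean of per-class F1
    print("weighted: ", f1_score(y_true, y_pred, average='weighted'))  # mean weighted by class support
    print("micro:    ", f1_score(y_true, y_pred, average='micro'))     # pooled TP/FP/FN across classes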
Ejemplo n.º 53
0
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                label,
                                                test_size=.3,
                                                random_state=1984)
print("len(X) = %d" % (len(X)))
print("len(Xtrain) = %d" % (len(Xtrain)))
print("len(Xtest) = %d" % (len(Xtest)))

model = BernoulliNB()
model.fit(Xtrain, Ytrain)
prediction = model.predict(Xtest)
accuracy = (Ytest == prediction).mean()

# Use new names so the sklearn metric functions are not shadowed by their results
precision = precision_score(Ytest, prediction, average="macro")
recall = recall_score(Ytest, prediction, average="macro")
f1 = f1_score(Ytest, prediction, average="macro")

print("accuracy %.3f" % accuracy)
print("precision_score %.3f" % precision)
print("recall_score %.3f" % recall)
print("f1_score %.3f" % f1)

cm = confusion_matrix(Ytest, prediction, labels=[0, 1, 2, 3])
print(cm)
print("SEE THE RESULT IN THE FILE NBLibrary.xlsx")

############ Write the result to an Excel file ###############
export("NBLibrary.xlsx", cm, len(X), len(Xtrain), len(Xtest),
       accuracy, precision, recall, f1)
'''
import os
Ejemplo n.º 54
0
#print(classifier_Y.shape[0])

knn_classifier = KNeighborsClassifier(n_neighbors=10)
knn_classifier.fit(c_x_train, c_y_train)

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(c_x_train, c_y_train)

knn_guesses = knn_classifier.predict(c_x_test)
svm_guesses = svm_classifier.predict(c_x_test)

print("K-Nearest Neighbors Accuracy, Recall, Precision, and F1 score are as follows:")
print(accuracy_score(c_y_test, knn_guesses))
print(recall_score(c_y_test, knn_guesses, average='macro'))
print(precision_score(c_y_test, knn_guesses, average='macro'))
print(f1_score(c_y_test, knn_guesses, average='macro'))

print(
    "Support Vector Machine Accuracy, Recall, Precision, and F1 score are as follows:"
)
print(accuracy_score(c_y_test, svm_guesses))
print(recall_score(c_y_test, svm_guesses, average='macro'))
print(precision_score(c_y_test, svm_guesses, average='macro'))
print(f1_score(c_y_test, svm_guesses, average='macro'))

#print(guesses)

###  ----- Normalization of Regression Data
# ----------------------------------------------------------------------------------

feature_data1 = df[['sex_code', 'education_code']]
X = pd.read_csv('D:/SKRIPSI/percobaan/1332data9klas/tfidf1332.csv')


# In[2]:

kf = KFold(len(X), n_folds=10, shuffle=True, random_state=9999)
model_train_index = []
model_test_index = []
model = 0

for k, (index_train, index_test) in enumerate(kf):
    X_train, X_test = X.ix[index_train, :], X.ix[index_test, :]
    y_train, y_test = y[index_train], y[index_test]
    clf = MultinomialNB(alpha=0.1, fit_prior=True, class_prior=None).fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    f1score = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print('Model %d has accuracy %f with | f1score: %f | precision: %f | recall : %f' % (k, score, f1score, precision, recall))
    model_train_index.append(index_train)
    model_test_index.append(index_test)
    model+=1
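
# For reference (added): the loop above uses the pre-0.18 sklearn
# cross_validation.KFold signature and pandas .ix indexing; with current
# libraries the equivalent setup, as a sketch, would be:
#
#     from sklearn.model_selection import KFold
#     kf = KFold(n_splits=10, shuffle=True, random_state=9999)
#     for k, (index_train, index_test) in enumerate(kf.split(X)):
#         X_train, X_test = X.iloc[index_train], X.iloc[index_test]
#         y_train, y_test = y[index_train], y[index_test]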


# In[5]:

temp = df.klasifikasi


# In[ ]:
Ejemplo n.º 56
0
def main(_):
    # word_id_mapping_o, w2v_o = load_w2v(FLAGS.embedding_file, FLAGS.embedding_dim, True)
    word_id_mapping_o, w2v_o = load_word_embedding(FLAGS.word_id_file, FLAGS.embedding_file, FLAGS.embedding_dim, True)
    word_embedding_o = tf.constant(w2v_o, dtype=tf.float32)
    # word_id_mapping_r, w2v_r = load_w2v(FLAGS.embedding_file_r, FLAGS.embedding_dim, True)
    # word_id_mapping_r, w2v_r = load_word_embedding(FLAGS.word_id_file, FLAGS.embedding_file_r, FLAGS.embedding_dim, True)
    word_id_mapping_r = word_id_mapping_o
    word_embedding_r = tf.constant(w2v_o, dtype=tf.float32)

    with tf.name_scope('inputs'):
        keep_prob1 = tf.placeholder(tf.float32)
        keep_prob2 = tf.placeholder(tf.float32)
        x_o = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sentence_len])
        x_r = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sentence_len])
        sen_len_o = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len])
        sen_len_r = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len])
        doc_len_o = tf.placeholder(tf.int32, None)
        doc_len_r = tf.placeholder(tf.int32, None)
        y = tf.placeholder(tf.float32, [None, FLAGS.n_class])

        inputs_o = tf.nn.embedding_lookup(word_embedding_o, x_o)
        inputs_o = tf.reshape(inputs_o, [-1, FLAGS.max_sentence_len, FLAGS.embedding_dim])
        inputs_r = tf.nn.embedding_lookup(word_embedding_r, x_r)
        inputs_r = tf.reshape(inputs_r, [-1, FLAGS.max_sentence_len, FLAGS.embedding_dim])

    prob = hn_inter_att(inputs_o, sen_len_o, doc_len_o, inputs_r, sen_len_r, doc_len_r, keep_prob1, keep_prob2)

    with tf.name_scope('loss'):
        reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prob, labels=y)) + tf.add_n(reg_loss)
        all_vars = [var for var in tf.global_variables()]

    with tf.name_scope('train'):
        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
        grads, global_norm = tf.clip_by_global_norm(tf.gradients(loss, all_vars), 5.0)
        train_op = optimizer.apply_gradients(zip(grads, all_vars), name='train_op', global_step=global_step)

    with tf.name_scope('predict'):
        cor_pred = tf.equal(tf.argmax(prob, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(cor_pred, tf.float32))
        accuracy_num = tf.reduce_sum(tf.cast(cor_pred, tf.int32))

    true_y = tf.argmax(y, 1)
    pred_y = tf.argmax(prob, 1)

    title = '-d1-{}d2-{}b-{}r-{}l2-{}sen-{}dim-{}h-{}c-{}'.format(
        FLAGS.keep_prob1,
        FLAGS.keep_prob2,
        FLAGS.batch_size,
        FLAGS.learning_rate,
        FLAGS.l2_reg,
        FLAGS.max_sentence_len,
        FLAGS.embedding_dim,
        FLAGS.n_hidden,
        FLAGS.n_class
    )

    def get_batch_data(xo, slo, dlo, xr, slr, dlr, yy, batch_size, kp1, kp2, is_shuffle=True):
        for index in batch_index(len(yy), batch_size, 1, is_shuffle):
            feed_dict = {
                x_o: xo[index],
                x_r: xr[index],
                y: yy[index],
                sen_len_o: slo[index],
                sen_len_r: slr[index],
                doc_len_o: dlo[index],
                doc_len_r: dlr[index],
                keep_prob1: kp1,
                keep_prob2: kp2,
            }
            yield feed_dict, len(index)

    conf = tf.ConfigProto(allow_soft_placement=True)
    conf.gpu_options.allow_growth = True
    with tf.Session(config=conf) as sess:
        import time
        timestamp = str(int(time.time()))
        _dir = 'summary/' + str(timestamp) + '_' + title
        test_loss = tf.placeholder(tf.float32)
        test_acc = tf.placeholder(tf.float32)
        train_summary_op, test_summary_op, validate_summary_op, train_summary_writer, test_summary_writer, \
        validate_summary_writer = summary_func(loss, accuracy, test_loss, test_acc, _dir, title, sess)

        save_dir = 'temp_model/' + str(timestamp) + '_' + title + '/'
        saver = saver_func(save_dir)

        init = tf.global_variables_initializer()
        sess.run(init)

        # saver.restore(sess, '/-')

        tr_x, tr_y, tr_sen_len, tr_doc_len = load_inputs_document(
            FLAGS.train_file,
            word_id_mapping_o,
            FLAGS.max_sentence_len,
            FLAGS.max_doc_len
        )
        te_x, te_y, te_sen_len, te_doc_len = load_inputs_document(
            FLAGS.test_file,
            word_id_mapping_o,
            FLAGS.max_sentence_len,
            FLAGS.max_doc_len
        )
        tr_x_r, tr_y_r, tr_sen_len_r, tr_doc_len_r = load_inputs_document(
            FLAGS.train_file_r,
            word_id_mapping_r,
            FLAGS.max_sentence_len,
            FLAGS.max_doc_len
        )
        te_x_r, te_y_r, te_sen_len_r, te_doc_len_r = load_inputs_document(
            FLAGS.test_file_r,
            word_id_mapping_r,
            FLAGS.max_sentence_len,
            FLAGS.max_doc_len
        )
        # v_x, v_y, v_sen_len, v_doc_len = load_inputs_document(
        #     FLAGS.validate_file_path,
        #     word_id_mapping,
        #     FLAGS.max_sentence_len,
        #     FLAGS.max_doc_len
        # )

        # v_x, v_y, v_sen_len, v_doc_len = load_inputs_document(
        #     FLAGS.validate_file_path,
        #     word_id_mapping,
        #     FLAGS.max_sentence_len,
        #     FLAGS.max_doc_len
        # )

        max_acc, max_prob, step = 0., None, None
        max_ty, max_py = None, None
        for i in xrange(FLAGS.n_iter):
            for train, _ in get_batch_data(tr_x, tr_sen_len, tr_doc_len, tr_x_r, tr_sen_len_r, tr_doc_len_r, tr_y,
                                           FLAGS.batch_size, FLAGS.keep_prob1, FLAGS.keep_prob2):
                _, step, summary = sess.run([train_op, global_step, train_summary_op], feed_dict=train)
                train_summary_writer.add_summary(summary, step)
                # embed_update = tf.assign(word_embedding, tf.concat([tf.zeros([1, FLAGS.embedding_dim]), word_embedding[1:]]), 0)
                # sess.run(embed_update)

            acc, cost, cnt = 0., 0., 0
            p, ty, py = [], [], []
            for test, num in get_batch_data(te_x, te_sen_len, te_doc_len, te_x_r, te_sen_len_r, te_doc_len_r, te_y, FLAGS.batch_size, 1.0, 1.0, False):
                _loss, _acc, _p, _ty, _py = sess.run([loss, accuracy_num, prob, true_y, pred_y], feed_dict=test)
                p += list(_p)
                ty += list(_ty)
                py += list(_py)
                acc += _acc
                cost += _loss * num
                cnt += num
            print 'all samples={}, correct prediction={}'.format(cnt, acc)
            acc = acc / cnt
            cost = cost / cnt
            print 'Iter {}: mini-batch loss={:.6f}, test acc={:.6f}'.format(i, cost, acc)
            summary = sess.run(test_summary_op, feed_dict={test_loss: cost, test_acc: acc})
            test_summary_writer.add_summary(summary, step)
            if acc > max_acc:
                max_acc = acc
                max_prob = p
                max_ty = ty
                max_py = py
                # saver.save(sess, save_dir, global_step=step)

        print 'P:', precision_score(max_ty, max_py, average=None)
        print 'R:', recall_score(max_ty, max_py, average=None)
        print 'F:', f1_score(max_ty, max_py, average=None)

        fp = open(FLAGS.prob_file, 'w')
        for item in max_prob:
            fp.write(' '.join([str(it) for it in item]) + '\n')
        print 'Optimization Finished! Max acc={}'.format(max_acc)

        print 'Learning_rate={}, iter_num={}, batch_size={}, hidden_num={}, l2={}'.format(
            FLAGS.learning_rate,
            FLAGS.n_iter,
            FLAGS.batch_size,
            FLAGS.n_hidden,
            FLAGS.l2_reg
        )
    for i in features:  ## run for every  feature
        my_list.append(i)
        X = df.loc[:, my_list].values  # data

        ## cross-validation
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.20,
                                                            random_state=0)

        #machine learning algorithm is applied in this section
        clf = ml_list[j]  #
        clf.fit(X_train, y_train)
        predict = clf.predict(X_test)
        f1 = clf.score(X_test, y_test)  # note: clf.score() returns accuracy here, not F1
        result = f1_score(y_test, predict, average='macro')
        accuracy = round(clf.score(X_test, y_test), 2)
        temp = "["

        for ii in my_list:
            temp += str(
                my_list.index(ii) + 1
            ) + ", "  #translate property list to sequence number for less space

        if result >= least:  # Keep the new feature if its macro F1 matches or beats the best value seen so far
            least = result
            print(
                '%-17s %-30s %-10s  %-10s %-15s %-15s ' %
                (j, i, result, accuracy, temp, "------> New feature found!!!"))

        else:  #If not, remove it from the list
Ejemplo n.º 58
0
def test(patch_shape, extraction_step):

    with tf.Graph().as_default():
        test_patches = tf.placeholder(tf.float32, [
            F.batch_size, patch_shape[0], patch_shape[1], patch_shape[2],
            F.num_mod
        ],
                                      name='real_patches')
        phase = tf.placeholder(tf.bool)

        # Define the network
        # For using actual 3-D U-Net change ***trained_network*** function both in training and testing
        #output_soft = trained_network(test_patches, phase, patch_shape, reuse=None)
        output_soft = trained_network_dis(test_patches, reuse=None)

        # To convert from one hat form
        output = tf.argmax(output_soft, axis=-1)
        print("Output Patch Shape:", output.get_shape())

        # To load the saved checkpoint
        saver = tf.train.Saver()
        with tf.Session() as sess:
            try:
                load_model(F.best_checkpoint_dir, sess, saver)
                print(" Checkpoint loaded succesfully!....\n")
            except:
                print(" [!] Checkpoint loading failed!....\n")
                return

            # Get patches from test images
            patches_test, labels_test = preprocess_dynamic_lab(
                F.data_directory,
                F.num_classes,
                extraction_step,
                patch_shape,
                F.number_train_images,
                validating=F.training,
                testing=F.testing,
                num_images_testing=F.number_test_images)
            total_batches = int(patches_test.shape[0] / F.batch_size)

            # Array to store the prediction results
            predictions_test = np.zeros((patches_test.shape[0], patch_shape[0],
                                         patch_shape[1], patch_shape[2]))

            print("max and min of patches_test:", np.min(patches_test),
                  np.max(patches_test))

            # Batch wise prediction
            print("Total number of Batches: ", total_batches)
            for batch in range(total_batches):
                patches_feed = patches_test[batch * F.batch_size:(batch + 1) *
                                            F.batch_size, :, :, :, :]
                preds = sess.run(output,
                                 feed_dict={
                                     test_patches: patches_feed,
                                     phase: False
                                 })
                predictions_test[batch * F.batch_size:(batch + 1) *
                                 F.batch_size, :, :, :] = preds
                print(("Processed_batch:[%8d/%8d]") % (batch, total_batches))

            print("All patches Predicted")

            print("Shape of predictions_test, min and max:",
                  predictions_test.shape, np.min(predictions_test),
                  np.max(predictions_test))

            #To stitch the image back
            images_pred = recompose3D_overlap(predictions_test, 144, 192, 256,
                                              extraction_step[0],
                                              extraction_step[1],
                                              extraction_step[2])

            print("Shape of Predicted Output Groundtruth Images:",
                  images_pred.shape, np.min(images_pred), np.max(images_pred),
                  np.mean(images_pred), np.mean(labels_test))

            # To save the images
            for i in range(F.number_test_images):
                pred2d = np.reshape(images_pred[i], (144 * 192 * 256))
                lab2d = np.reshape(labels_test[i], (144 * 192 * 256))
                save_image(F.results_dir, images_pred[i],
                           F.number_train_images + i + 2)
                F1_score = f1_score(lab2d, pred2d, labels=[0, 1, 2, 3], average=None)

            # Evaluation
            pred2d = np.reshape(images_pred,
                                (images_pred.shape[0] * 144 * 192 * 256))
            lab2d = np.reshape(labels_test,
                               (labels_test.shape[0] * 144 * 192 * 256))

            F1_score = f1_score(lab2d, pred2d, labels=[0, 1, 2, 3], average=None)
            print("Testing Dice Coefficient.... ")
            print("Background:", F1_score[0])
            print("CSF:", F1_score[1])
            print("GM:", F1_score[2])
            print("WM:", F1_score[3])

    return
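
# Added side note: for binary masks, per-class F1 as used above is exactly the
# Dice coefficient, Dice = 2*TP / (2*TP + FP + FN). A tiny self-contained check
# with made-up labels:
def _dice_equals_f1_demo():
    import numpy as np
    from sklearn.metrics import f1_score
    y_true = np.array([0, 0, 1, 1, 1, 0])
    y_pred = np.array([0, 1, 1, 1, 0, 0])
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    dice = 2.0 * tp / (2.0 * tp + fp + fn)
    assert np.isclose(dice, f1_score(y_true, y_pred))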
Ejemplo n.º 59
0
    def evaluate(self,dev,avg_best,BLEU=False):
        logging.info("STARTING EVALUATION")
        acc_avg = 0.0
        wer_avg = 0.0
        acc_G = 0.0
        acc_P = 0.0
        acc_V = 0.0
        microF1_PRED,microF1_PRED_cal,microF1_PRED_nav,microF1_PRED_wet = [],[],[],[]
        microF1_TRUE,microF1_TRUE_cal,microF1_TRUE_nav,microF1_TRUE_wet = [],[],[],[]
        ref = []
        hyp = []
        ref_s = ""
        hyp_s = ""
        pbar = tqdm(enumerate(dev),total=len(dev))
        for j, data_dev in pbar:
            # (T,B) a list of list
            words = self.evaluate_batch(len(data_dev[1]),data_dev[0],data_dev[1],data_dev[2],data_dev[3],data_dev[4],data_dev[5],data_dev[6])            
            acc=0
            w = 0
            temp_gen = []
            for i, row in enumerate(np.transpose(words)):       # (B,T)
                st = ''
                for e in row:
                    if e == '<EOS>':
                        break
                    else:
                        st += e + ' '
                temp_gen.append(st)
                correct = data_dev[7][i]
                ### compute F1 SCORE  
                if(len(data_dev)>10):
                    f1_true,f1_pred = computeF1(data_dev[8][i],st.lstrip().rstrip(),correct.lstrip().rstrip())
                    microF1_TRUE += f1_true
                    microF1_PRED += f1_pred

                    f1_true,f1_pred = computeF1(data_dev[9][i],st.lstrip().rstrip(),correct.lstrip().rstrip())
                    microF1_TRUE_cal += f1_true
                    microF1_PRED_cal += f1_pred 

                    f1_true,f1_pred = computeF1(data_dev[10][i],st.lstrip().rstrip(),correct.lstrip().rstrip())
                    microF1_TRUE_nav += f1_true
                    microF1_PRED_nav += f1_pred 

                    f1_true,f1_pred = computeF1(data_dev[11][i],st.lstrip().rstrip(),correct.lstrip().rstrip()) 
                    microF1_TRUE_wet += f1_true
                    microF1_PRED_wet += f1_pred  
                
                if (correct.lstrip().rstrip() == st.lstrip().rstrip()):
                    acc+=1
                w += wer(correct.lstrip().rstrip(),st.lstrip().rstrip())
                ref.append(str(correct.lstrip().rstrip()))
                hyp.append(str(st.lstrip().rstrip()))
                ref_s+=str(correct.lstrip().rstrip())+ "\n"
                hyp_s+=str(st.lstrip().rstrip()) + "\n"

            acc_avg += acc/float(len(data_dev[1]))
            wer_avg += w/float(len(data_dev[1]))
            pbar.set_description("R:{:.4f},W:{:.4f}".format(acc_avg/float(len(dev)),wer_avg/float(len(dev))))
        if(len(data_dev)>10):
            logging.info("F1 SCORE:\t"+str(f1_score(microF1_TRUE, microF1_PRED, average='micro')))
            logging.info("F1 CAL:\t"+str(f1_score(microF1_TRUE_cal, microF1_PRED_cal, average='micro')))
            logging.info("F1 WET:\t"+str(f1_score(microF1_TRUE_wet, microF1_PRED_wet, average='micro')))
            logging.info("F1 NAV:\t"+str(f1_score(microF1_TRUE_nav, microF1_PRED_nav, average='micro')))

        if (BLEU):       
            bleu_score = moses_multi_bleu(np.array(hyp), np.array(ref), lowercase=True) 
            logging.info("BLEU SCORE:"+str(bleu_score))     
                                                                      
            if (bleu_score >= avg_best):
                self.save_model(str(self.name)+str(bleu_score))
                logging.info("MODEL SAVED")
            return bleu_score
        else:              
            acc_avg = acc_avg/float(len(dev))
            if (acc_avg >= avg_best):
                self.save_model(str(self.name)+str(acc_avg))
                logging.info("MODEL SAVED")
            return acc_avg
def train_or_eval_model(model,
                        loss_function,
                        dataloader,
                        epoch,
                        optimizer=None,
                        train=False):
    losses = []
    preds = []
    labels = []
    masks = []
    alphas, alphas_f, alphas_b, vids = [], [], [], []
    assert not train or optimizer is not None
    if train:
        model.train()
    else:
        model.eval()
    for data in dataloader:
        if train:
            optimizer.zero_grad()
        # import ipdb;ipdb.set_trace()
        acouf, qmask, umask, label =\
                [d.cuda() for d in data[:-1]] if cuda else data[:-1]
        #log_prob = model(torch.cat((textf,acouf,visuf),dim=-1), qmask,umask) # seq_len, batch, n_classes
        log_prob, alpha, alpha_f, alpha_b = model(
            acouf, qmask, umask)  # seq_len, batch, n_classes
        lp_ = log_prob.transpose(0, 1).contiguous().view(
            -1,
            log_prob.size()[2])  # batch*seq_len, n_classes
        labels_ = label.view(-1)  # batch*seq_len
        loss = loss_function(lp_, labels_, umask)

        pred_ = torch.argmax(lp_, 1)  # batch*seq_len
        preds.append(pred_.data.cpu().numpy())
        labels.append(labels_.data.cpu().numpy())
        masks.append(umask.view(-1).cpu().numpy())

        losses.append(loss.item() * masks[-1].sum())
        if train:
            loss.backward()
            optimizer.step()
        else:
            alphas += alpha
            alphas_f += alpha_f
            alphas_b += alpha_b
            vids += data[-1]

    if preds != []:
        preds = np.concatenate(preds)
        labels = np.concatenate(labels)
        masks = np.concatenate(masks)
    else:
        return float('nan'), float('nan'), [], [], [], float('nan'), []

    avg_loss = round(np.sum(losses) / np.sum(masks), 4)
    avg_accuracy = round(
        accuracy_score(labels, preds, sample_weight=masks) * 100, 2)
    avg_fscore = round(
        f1_score(labels, preds, sample_weight=masks, average='weighted') * 100,
        2)
    return avg_loss, avg_accuracy, labels, preds, masks, avg_fscore, [
        alphas, alphas_f, alphas_b, vids
    ]
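
# Added note on the metric above: sample_weight=masks zeroes out padded
# timesteps, so accuracy and weighted F1 are computed over real tokens only.
# A tiny self-contained illustration with made-up values:
def _masked_f1_demo():
    import numpy as np
    from sklearn.metrics import f1_score
    labels = np.array([1, 0, 1, 0])
    preds = np.array([1, 1, 1, 0])
    mask = np.array([1, 1, 1, 0])  # last position is padding and is ignored
    unmasked = f1_score(labels, preds, average='weighted')
    masked = f1_score(labels, preds, sample_weight=mask, average='weighted')
    return unmasked, masked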