Example #1
    def fit_and_predict(self):
        print "****** train start ******"
        print "sample:", list(self.y_train).count(0), list(self.y_train).count(1)
        print "train percent is %s" % (float(list(self.y_train).count(1)) / len(list(self.y_train)))
        start_at = time.time()

        x_train_features = self.scaler.fit_transform(self.x_train)
        # x_train_features = self.x_train
        self.classifier.fit(x_train_features, self.y_train)

        print "train done. score %s. spend %s s" \
              % (self.classifier.score(
                  x_train_features, self.y_train), time.time() - start_at)

        x_test_features = self.scaler.transform(self.x_test)  # reuse the scaler fitted on the training data
        # x_test_features = self.x_test
        y_test_pred = self.classifier.predict(x_test_features)
        print "****** test result *******"
        print y_test_pred, len(y_test_pred)
        print list(y_test_pred).count(0), list(y_test_pred).count(1),\
            set(y_test_pred), len(set(y_test_pred))
        print "test percent is %s" % (float(list(y_test_pred).count(1)) / len(list(y_test_pred)))
        print accuracy_score(self.y_test, y_test_pred)

        x_pred_features = self.scaler.transform(self.p_x)  # reuse the scaler fitted on the training data
        # x_pred_features = self.p_x
        y_pred = self.classifier.predict(x_pred_features)
        print "****** predict result *******"
        print y_pred, len(y_pred)
        print list(y_pred).count(0), list(y_pred).count(1), set(y_pred), len(set(y_pred))
        print "predict percent is %s" % (float(list(y_pred).count(1)) / len(list(y_pred)))

        return y_pred
    def readout_sk(self, X_train, X_test, y_train, y_test, **kwargs):
        from sklearn.linear_model import LogisticRegression
        lr = LogisticRegression(**kwargs)
        lr.fit(X_train.T, y_train.T)
        y_train_predictions = lr.predict(X_train.T)
        y_test_predictions = lr.predict(X_test.T)
        return (accuracy_score(y_train.T, y_train_predictions),
                accuracy_score(y_test.T, y_test_predictions))
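# Hypothetical usage sketch (not from the original source). The .T transposes
# above suggest arrays laid out as (n_features, n_samples); `Readout` is a
# stand-in for whatever class defines readout_sk.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

class Readout:
    def readout_sk(self, X_train, X_test, y_train, y_test, **kwargs):
        lr = LogisticRegression(**kwargs)
        lr.fit(X_train.T, y_train.T)
        return (accuracy_score(y_train.T, lr.predict(X_train.T)),
                accuracy_score(y_test.T, lr.predict(X_test.T)))

rng = np.random.RandomState(0)
X_train = rng.randn(20, 100)                     # 20 features x 100 samples
X_test = rng.randn(20, 40)
y_train = (X_train.sum(axis=0) > 0).astype(int)  # length 100; .T is a no-op on 1-D arrays
y_test = (X_test.sum(axis=0) > 0).astype(int)
print(Readout().readout_sk(X_train, X_test, y_train, y_test, max_iter=200))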
def supportvector(C, gamma = 'default'):
    
    from time import time
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    
    if gamma == 'default':
        clf = SVC(kernel="rbf", C = C)
    else:
        clf = SVC(kernel="rbf", C = C, gamma = gamma)
    
    t_fit = time()
    clf.fit(features_train, labels_train)
    print("training time:", round(time() - t_fit, 3), "s")

    t_pred = time()
    pred = clf.predict(features_test)
    print("predict time:", round(time() - t_pred, 3), "s")

    print(accuracy_score(labels_test, pred))
    
    try:
        prettyPicture(clf, features_test, labels_test)
    except NameError:
        pass
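# Hypothetical usage sketch (these globals are assumed, not shown in the
# original): supportvector() reads features_train / labels_train /
# features_test / labels_test from the enclosing scope, so define them first,
# e.g. from a synthetic dataset.
import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(42)
X = rng.randn(400, 2)
y = (X[:, 0] * X[:, 1] > 0).astype(int)
features_train, features_test, labels_train, labels_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

supportvector(C=10000.0)             # default gamma
supportvector(C=10000.0, gamma=1.0)  # explicit gamma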
def main():
	f = open("me.stdout", "r").read()

	print f
	
	(confusionMatrix, labels, ytrue, ypred, trueCount) = readConfusionMatrix.readText(f)
	for row in confusionMatrix:
		print row

	precisionMicro = float(metrics.precision_score(ytrue, ypred, average="micro"))
	recallMicro = float(metrics.recall_score(ytrue, ypred, average="micro"))
	f1Micro = float(metrics.f1_score(ytrue, ypred, average="micro"))
	f1Macro = float(metrics.f1_score(ytrue, ypred, average="macro"))
	precisionMacro = float(metrics.precision_score(ytrue, ypred, average="macro"))
	recallMacro = float(metrics.recall_score(ytrue, ypred, average="macro"))

	mConf = metrics.confusion_matrix(ytrue, ypred)
	print(mConf)

	print(labels)
	print(len(ytrue))
	print(len(ypred))
	print(trueCount)

	print(metrics.accuracy_score(ytrue, ypred))

	print(precisionMicro)
	print(recallMicro)
	print(f1Micro)
	print(f1Macro)
	print(precisionMacro)
	print(recallMacro)
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x:x,max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1,verbose=1,class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw,ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    print(rf.oob_score_)

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print(classification_report(y_true=ytrain_raw, y_pred=ytrain_predict))
    print(confusion_matrix(y_true=ytrain_raw, y_pred=ytrain_predict))

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print(accuracy_score(y_true=ytest_raw, y_pred=ytest_predict))
    print(classification_report(y_true=ytest_raw, y_pred=ytest_predict))
Example #6
    def score(self, K, y, sample_weight=None):
        """Returns the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always
        predicts the expected value of y, disregarding the input features,
        would get a R^2 score of 0.0.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) wrt. y.
        """
        y_pred = self.predict(K)
        if sample_weight is None:
            return np.mean([accuracy_score(
                y[j], y_pred[j]) for j in range(len(K))])
        else:
            return np.mean([
                accuracy_score(y[j], y_pred[j], sample_weight=sample_weight[j])
                for j in range(len(K))])
def ranforest(n_estimators, min_samples_split):
    
    from time import time
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    
    clf = RandomForestClassifier(n_estimators = n_estimators, 
                                 min_samples_split = min_samples_split,
                                 bootstrap = True)
    t_fit = time()
    clf.fit(features_train, labels_train)
    print("training time:", round(time() - t_fit, 3), "s")

    t_pred = time()
    pred = clf.predict(features_test)
    print("predict time:", round(time() - t_pred, 3), "s")

    print(accuracy_score(labels_test, pred))

    
    try:
        prettyPicture(clf, features_test, labels_test)
    except NameError:
        pass
def main():
    """
    Main function
    :return:
    """
    # Load the csv file into pandas dataframe
    dataset = pd.read_csv(OUTPUT_PATH)
    # Get basic statistics of the loaded dataset
    dataset_statistics(dataset)

    # Filter missing values
    dataset = handel_missing_values(dataset, HEADERS[6], '?')
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[1:-1], HEADERS[-1])

    # Train and Test dataset size details
    print("Train_x Shape ::", train_x.shape)
    print("Train_y Shape ::", train_y.shape)
    print("Test_x Shape ::", test_x.shape)
    print("Test_y Shape ::", test_y.shape)

    # Create random forest classifier instance
    trained_model = random_forest_classifier(train_x, train_y)
    print "Trained model :: ", trained_model
    predictions = trained_model.predict(test_x)

    for i in range(0, 5):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(list(test_y)[i], predictions[i]))

    print("Train Accuracy ::", accuracy_score(train_y, trained_model.predict(train_x)))
    print("Test Accuracy  ::", accuracy_score(test_y, predictions))
    print("Confusion matrix ::")
    print(confusion_matrix(test_y, predictions))
Example #9
 def on_epoch_end(self, epoch, logs={}):
     p = model.predict(X, verbose=1)
     #p = model.predict({'input': X}, verbose=1)['output']
     
     train_acc = accuracy_score(Y[:t1], np.round(p[:t1]))
     dev_acc = accuracy_score(Y[t1:t2], np.round(p[t1:t2]))
     test_acc = accuracy_score(Y[t2:], np.round(p[t2:]))
     print('Accuracy | train:', train_acc, 'dev:', dev_acc, 'test:', test_acc)
     
     train_recall = (recall_score(Y[:t1], np.round(p[:t1])) + recall_score(Y[:t1], np.round(p[:t1]), pos_label=0)) / 2
     dev_recall = (recall_score(Y[t1:t2], np.round(p[t1:t2])) + recall_score(Y[t1:t2], np.round(p[t1:t2]), pos_label=0)) / 2
     test_recall = (recall_score(Y[t2:], np.round(p[t2:])) + recall_score(Y[t2:], np.round(p[t2:]), pos_label=0)) / 2
     print('Recall   | train:', train_recall, 'dev:', dev_recall, 'test:', test_recall)
     
     self.accuracy[epoch, :] = np.array([train_acc, dev_acc, test_acc])
     self.recall[epoch, :] = np.array([train_recall, dev_recall, test_recall])
     
     plt.clf()
     plt.subplot(211)
     lines = plt.plot(range(1, epoch+2), self.accuracy[:epoch+1])
     plt.legend(iter(lines), ('train', 'dev', 'test'), loc=4)
     plt.xlabel('Epoch')
     plt.ylabel('Accuracy')
     plt.axis([1, epoch+2, 0, 1])
     plt.subplot(212)
     lines = plt.plot(range(1, epoch+2), self.recall[:epoch+1])
     plt.legend(iter(lines), ('train', 'dev', 'test'), loc=4)
     plt.xlabel('Epoch')
     plt.ylabel('Average recall')
     plt.axis([1, epoch+2, 0, 1])
     plt.savefig('results.png')
Example #10
def feature_scaled_nn_acc(mds, type):
    train, validation = validation_split(mds)
    # Multiply by 1 to convert bool to int
    y_train = train['Up'] * 1
    X_train = train.drop('Up', axis=1)
    y_validation = validation['Up'] * 1
    X_validation = validation.drop('Up', axis=1)
    pre = PCA(n_components=19, whiten=True)
    X_train_pca = pre.fit_transform(X_train)
    X_validation_pca = pre.transform(X_validation)  # reuse the PCA fitted on the training set
    model = create_model(X_train_pca.shape[1], type)
    # Convert to Keras format
    y_train = to_categorical(y_train.values)
    y_validation = to_categorical(y_validation.values)
    model.fit(X_train_pca, y_train, nb_epoch=5, batch_size=16)
    time.sleep(0.1)
    # Fit and guess
    guess_train = model.predict_classes(X_train_pca)
    guess_train = to_categorical(guess_train)

    guess_validation = model.predict_classes(X_validation_pca)
    guess_validation = to_categorical(guess_validation)

    train_acc = accuracy_score(y_train, guess_train)
    validation_acc = accuracy_score(y_validation, guess_validation)
    print "\n neural net train accuracy is {}".format(train_acc)
    print "\n neural net validation accuracy is {}".format(validation_acc)
    return guess_validation
Example #11
def nn_acc(mds, type, epoch=5, batch=16):
    train, validation = validation_split(mds)
    # Multiply by 1 to convert bool to int
    y_train = train['Up'] * 1
    X_train = train.drop('Up', axis=1)
    y_validation = validation['Up'] * 1
    X_validation = validation.drop('Up', axis=1)
    # Create Model
    model = create_model(X_train.shape[1], type)
    # Convert to Keras format
    X_train = X_train.to_numpy()
    X_validation = X_validation.to_numpy()
    y_train = to_categorical(y_train.values)
    y_validation = to_categorical(y_validation.values)
    # Fit and guess
    model.fit(X_train, y_train, nb_epoch=epoch, batch_size=batch)
    guess_train = model.predict_classes(X_train)
    guess_train = to_categorical(guess_train)

    guess_validation = model.predict_classes(X_validation)
    guess_validation = to_categorical(guess_validation)

    train_acc = accuracy_score(y_train, guess_train)
    validation_acc = accuracy_score(y_validation, guess_validation)
    print "\n neural net train accuracy is {}".format(train_acc)
    print "\n neural net validation accuracy is {}".format(validation_acc)
    return guess_validation
Example #12
def Adaboost(TrainData,TestData):
    features=['Time','Season','Hour','Minute','District']

    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(),n_estimators=30)

    size=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
    for i in range(0,len(size)):
        train,validation= train_test_split(TrainData, train_size=size[i])

        while len(set(train['Category'])) != len(set(validation['Category'])):
            train,validation= train_test_split(TrainData, train_size=size[i])
        clf = clf.fit(train[features], train['Category'])
        """stop = timeit.default_timer()
        print "Runnin  time adaboost is ", stop-start"""
        predicted=np.array(clf.predict_proba(validation[features]))
        model=clf.predict(train[features])
        model1=clf.predict(validation[features])

        #scores = cross_val_score(clf, validation[features], validation['Category'])
        #print "Scores mean is",scores.mean()
        #accuracy
        print "Training accuracy is", accuracy_score(train['Category'].values.tolist(),model)
        print "Validation accuracy is",accuracy_score(validation['Category'].values.tolist(),model1)
        print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Log loss is", log_loss(validation['Category'].values.tolist(),predicted,eps=1e-15, normalize=True, sample_weight=None)


        #writing to file
        """Category_new=[]
def sub_analysis(subs, sub_data_generator):
        for sub in subs:
            print("process sub: %0.0f" % sub)
            # X_tr, y_tr, X_te, y_te, pipe = next(sub_data_generator)
            X, y, _, _, pipe = next(sub_data_generator)
            cv_scores  = []
            te1_scores = []
            te2_scores = []
            for state in range(3):
                X_tr, X_te, y_tr, y_te = prep.stratified_split(X, y, state)
                # train on nmsk:
                algo, algo_params, score = prep.params.model
                clf = GridSearchCV(algo, algo_params, cv=3, scoring=score, n_jobs=-1, verbose=0)
                clf.fit(prep.convert(pipe, X_tr), y_tr)
                # test on msk:
                if prep.params.collapse_opacities:
                    X_te_high, y_te_high, X_te_low, y_te_low = prep.split_high_low_sets(X_te, y_te)
                    y_te_high = prep.collapse_lbls(y_te_high, (3,4,5), (0,1,2))
                    y_true_high, y_pred_high = y_te_high, clf.predict(prep.convert(pipe, X_te_high))
                    y_true_low,  y_pred_low  = y_te_low,  clf.predict(prep.convert(pipe, X_te_low))
                    cv_scores.append(clf.best_score_)
                    te1_scores.append(accuracy_score(y_true_high, y_pred_high))
                    te2_scores.append(accuracy_score(y_true_low, y_pred_low))
                else:
                    y_true, y_pred = y_te, clf.predict(prep.convert(pipe, X_te))
                    cv_scores.append(clf.best_score_)
                    te1_scores.append(accuracy_score(y_true, y_pred))
                    te2_scores = None
            yield cv_scores, te1_scores, te2_scores
def assess_classification_performance(model, X_train, y_train, X_test, y_test, short = False):
  
    accuracy_train = metrics.accuracy_score(y_train, model.predict(X_train))
    accuracy_test = metrics.accuracy_score(y_test, model.predict(X_test))
    print('accuracy (train/test): {} / {}\n'.format(accuracy_train, accuracy_test))
    
    if not short:
    
      # confusion matrix
      # rows: actual group
      # columns: predicted group
      print('Confusion_matrix (training data):')
      print(metrics.confusion_matrix(y_train, model.predict(X_train)))
      
      print('Confusion_matrix (test data):')
      print(metrics.confusion_matrix(y_test, model.predict(X_test)))

      # precision =  tp / (tp + fp)
      # recall = tp / (tp + fn) (= sensitivity)
      # F1 = 2 * (precision * recall) / (precision + recall)
      print('\nPrecision - recall (training data):')
      print(metrics.classification_report(y_train, model.predict(X_train)))
      
      print('\nPrecision - recall (test data):')
      print(metrics.classification_report(y_test, model.predict(X_test)))
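# Usage sketch (the data and model below are illustrative assumptions): any
# fitted estimator with a .predict method works, e.g. logistic regression on iris.
from sklearn import datasets, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = datasets.load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
assess_classification_performance(clf, X_tr, y_tr, X_te, y_te)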
Example #15
    def testIrisStreaming(self):
        iris = datasets.load_iris()

        def iris_data():
            while True:
                for x in iris.data:
                    yield x

        def iris_predict_data():
            for x in iris.data:
                yield x

        def iris_target():
            while True:
                for y in iris.target:
                    yield y

        classifier = skflow.TensorFlowLinearClassifier(n_classes=3, steps=100)
        classifier.fit(iris_data(), iris_target())
        score1 = accuracy_score(iris.target, classifier.predict(iris.data))
        score2 = accuracy_score(iris.target, classifier.predict(iris_predict_data()))
        self.assertGreater(score1, 0.5, "Failed with score = {0}".format(score1))
        self.assertEqual(score2, score1, "Scores from {0} iterator doesn't "
                                         "match score {1} from full "
                                         "data.".format(score2, score1))
Example #16
def main(unused_args):
  ### Download and load MNIST dataset.
  mnist = learn.datasets.load_dataset('mnist')

  ### Linear classifier.
  feature_columns = learn.infer_real_valued_columns_from_input(
      mnist.train.images)
  classifier = learn.LinearClassifier(
      feature_columns=feature_columns, n_classes=10)
  classifier.fit(mnist.train.images,
                 mnist.train.labels.astype(np.int32),
                 batch_size=100,
                 steps=1000)
  score = metrics.accuracy_score(mnist.test.labels,
                                 list(classifier.predict(mnist.test.images)))
  print('Accuracy: {0:f}'.format(score))

  ### Convolutional network
  classifier = learn.Estimator(model_fn=conv_model)
  classifier.fit(mnist.train.images,
                 mnist.train.labels,
                 batch_size=100,
                 steps=20000)
  score = metrics.accuracy_score(mnist.test.labels,
                                 list(classifier.predict(mnist.test.images)))
  print('Accuracy: {0:f}'.format(score))
Example #17
def main():
	input_train_file_ptr = sys.argv[1]
	input_test_file_ptr = sys.argv[2]
	# read the csv file and return a pandas dataframe with tweets and sentiment as its two columns
	train_tweests_with_sentiments = pre_process_input_data(input_train_file_ptr)
	test_tweets_data = pre_process_input_data(input_test_file_ptr)
	bigram_vectorizer = CountVectorizer(ngram_range=(2,2),token_pattern=r'\b\w+\b', min_df=1,lowercase=True)
	# print tweests_array
	tweets_array, sentiments_array = get_tweest_and_sentiments(train_tweests_with_sentiments)
	print("size of tweets array is %s and sentiment array is %s  " % (tweets_array.size, sentiments_array.size))
	test_tweets,test_sentiments = get_tweest_and_sentiments(test_tweets_data)
	test_sentiments =  test_sentiments.flatten()
	print("size of test tweets array is %s and test sentiment array is %s  " % (test_tweets.size, test_sentiments.size))
	parsed_train_tweets = clean_data_to_feed_classifier(tweets_array)
	parsed_test_tweets = clean_data_to_feed_classifier(test_tweets)
	# print parsed_tweests
	x = bigram_vectorizer.fit_transform(parsed_train_tweets)
	print(x.size)
	# print bigram_vectorizer.get_feature_names()
	bigram_vectorizer.build_analyzer()
	print "done 1"
	# print bigram_vectorizer.get_feature_names()
	res = bigram_vectorizer.transform(parsed_test_tweets)
	print "done 2"
	clf = LinearSVC()
	gnb = MultinomialNB()
	print "done 2"
	trained_classifier = do_K_fold_cross_validation(clf,gnb,x,sentiments_array.flatten())
	# trained_classifier.fit(x, sentiments_array.flatten())
	print "done 3"
	output =  trained_classifier.predict(res)
	# print output
	print accuracy_score(test_sentiments,output)
Example #18
    def do(self, n_pts):
        X, y = self.collect_pts(n_pts)

        print('done collecting points')

        rbf_map = RBFSampler(n_components=n_pts, random_state=1)
        solver = HyperSolver(p=self.POS, n=self.NEG)
        rbf_solver = pipeline.Pipeline([("mapper", rbf_map),
                                        ("solver", solver)])

        gamma_range = np.logspace(-15, 6, 22, base=2)
        param_grid = dict(mapper__gamma=gamma_range)
        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=1)
        grid = GridSearchCV(rbf_solver, param_grid=param_grid, cv=cv, n_jobs=8)
        grid.fit(X, y)

        scores = [x[1] for x in grid.grid_scores_]
        scores = np.array(scores).reshape(len(gamma_range))
        plt.figure(figsize=(8, 6))
        plt.plot(gamma_range, scores)

        plt.xlabel('gamma')
        plt.ylabel('score')
        plt.title('Validation accuracy (RTiX, %s)' % os.path.basename(self.name))
        plt.savefig(self.name + '-SLViF-grid-npts=%d.pdf' % n_pts)

        # final train
        g = grid.best_params_['mapper__gamma']
        print('best parameters are g=%f' % g)
        rbf_svc2 = grid.best_estimator_
        y_pred = rbf_svc2.predict(self.Xt)
        acc = sm.accuracy_score(self.Yt, y_pred)
        print('SCORE: %f' % acc)
        return grid.best_score_, acc
Example #19
def detect_anomalies():

    encoded_X_train = np.load("resources/files/encoded_X_train.npy")
    encoded_X_test = np.load("resources/files/encoded_X_test.npy")
    print(encoded_X_train.shape)
    print(encoded_X_test.shape)

    clf = svm.OneClassSVM(nu=0.1, kernel="linear")
    clf.fit(encoded_X_train)
    y_pred_train = clf.predict(encoded_X_train)
    y_pred_test = clf.predict(encoded_X_test)
    y_pred_outliers = clf.predict(np.full((100,hidden_dimensions[1]),4))

    # print y_pred_train[y_pred_train == -1].size
    # print y_pred_test[y_pred_test == -1].size
    # print y_pred_outliers[y_pred_outliers == -1].size

    # n_normal_points_test = X_test[y_pred_test == 1]
    # n_anomalies_test = X_test[y_pred_test == -1]
    # print(n_normal_points_test.shape)
    # print(n_anomalies_test.shape)

    print("Train Accuracy: %f"%(accuracy_score(Y_train, y_pred_train)))
    print("Test Accuracy: %f"%( accuracy_score(Y_test, y_pred_test)))
    print("Precision: %f" % (precision_score(Y_test, y_pred_test,pos_label=1)))
    #print("Recall: %f" % (precision_score(Y_test, y_pred_test, pos_label=-1)))
    print "Confusion Matrix: (Anomalies, Normal)"
    print confusion_matrix(Y_test,y_pred_test,labels=[-1,1])
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_test, pos_label=1)
    print "AUC: %f"%metrics.auc(fpr, tpr)
Example #20
    def grid_search(self):
        C_range = np.logspace(-5, 15, 21, base=2)
        param_grid = dict(C=C_range)
        cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(LinearSVC(dual=False, max_iter=10000), param_grid=param_grid,
                            cv=cv,
                            n_jobs=1, verbose=0)

        logger.info('start grid search for Linear')
        grid.fit(self.X_ex, self.y_ex)
        logger.info('end grid search for Linear')

        scores = [x[1] for x in grid.grid_scores_]

        # final train
        rbf_svc2 = grid.best_estimator_

        pred_train = rbf_svc2.predict(self.X_ex)
        pred_val = rbf_svc2.predict(self.val_x)
        pred_test = rbf_svc2.predict(self.test_x)

        r = Result(self.name + ' (X)', 'Linear', len(self.X_ex),
                   sm.accuracy_score(self.y_ex, pred_train),
                   sm.accuracy_score(self.val_y, pred_val),
                   sm.accuracy_score(self.test_y, pred_test))
        return r
def getAccuracy(labels,predictedLabels, positive_label):

    totalExamples = labels.shape[0]
    accuracy = (getTP(labels,predictedLabels,positive_label) + getTN(labels,predictedLabels,positive_label)) / float(totalExamples)
    print ("Built-in accuracy = {}".format(metrics.accuracy_score(labels, predictedLabels)))
    print ("Accuracy = {}".format(accuracy))
    return metrics.accuracy_score(labels, predictedLabels)
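# getTP and getTN are referenced above but not defined in this snippet; a
# minimal sketch of what they plausibly compute on numpy label arrays
# (true-positive and true-negative counts for the given positive label):
def getTP(labels, predictedLabels, positive_label):
    return ((labels == positive_label) & (predictedLabels == positive_label)).sum()

def getTN(labels, predictedLabels, positive_label):
    return ((labels != positive_label) & (predictedLabels != positive_label)).sum()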
Example #22
    def train(self):
        X_train = self.dataset.df_train['ids_string'].values
        y_train = self.dataset.df_train['label'].values
        X_test = self.dataset.df_test['ids_string'].values
        y_test = self.dataset.df_test['label'].values

        print(datetime.datetime.now(), 'Vectorizing')
        if opt['bow_tfidf'] == False:
            self.cv = CountVectorizer(ngram_range=opt['bow_ngram_range'], min_df=opt['bow_min_df'])
            X_train = self.cv.fit_transform(X_train)
            X_test = self.cv.transform(X_test)
        else:
            self.tfidf = TfidfVectorizer(ngram_range=opt['bow_ngram_range'], min_df=opt['bow_min_df'])
            X_train = self.tfidf.fit_transform(X_train)
            X_test = self.tfidf.transform(X_test)

        #TODO: use sparse.vstack
        X_train = np.concatenate((X_train.todense(), self.dataset.df_train[self.dataset.features].values), axis=1)
        X_test = np.concatenate((X_test.todense(), self.dataset.df_test[self.dataset.features].values), axis=1)

        print(datetime.datetime.now(), 'Training')
        self.lr = LogisticRegression()
        self.lr.fit(X_train, y_train)

        y_predict = self.lr.predict(X_train)
        accuracy_train = accuracy_score(y_train, y_predict)
        y_predict = self.lr.predict(X_test)
        accuracy_test = accuracy_score(y_test, y_predict)
        print(datetime.datetime.now(), (accuracy_train, accuracy_test))
def testsvm2():
    genre_list = ["classical", "jazz", "country", "pop", "rock", "metal"]
    Xtrain,ytrain,Xtest,ytest = getSplitData()
    Xtrain, Xtest = getScaledData(Xtrain, Xtest)
    traindata = Xtrain
    trainlabel = ytrain
    testdata = Xtest
    testlabel = ytest
    X = np.vstack((Xtrain,Xtest))
    y = np.hstack((ytrain,ytest))
    accuracy1 = []
    
    kf = KFold(600, n_folds=10)
    for train, test in kf:
        #print("%s %s" % (train, test))
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        traindata = X_train
        trainlabel = y_train
        testdata = X_test
        testlabel = y_test
        clf = SVC(C = 1, gamma = 0.125, kernel = 'rbf')
        clf.fit(traindata, trainlabel)
        pred = clf.predict(testdata)
        print "svm classification accuracy: ", accuracy_score(testlabel,pred)
        cm = confusion_matrix(testlabel, pred)
        print cm
        plot_confusion_matrix(cm,genre_list)
        accuracy1.append(accuracy_score(testlabel,pred))
    print np.mean(accuracy1)
Example #24
def main(unused_argv):


    x,y=load_data()

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

    vp = learn.preprocessing.VocabularyProcessor(max_document_length=MAX_DOCUMENT_LENGTH, min_frequency=1)

    x_train = np.array(list(vp.fit_transform(x_train)))
    x_test = np.array(list(vp.transform(x_test)))
    n_words=len(vp.vocabulary_)
    print('Total words: %d' % n_words)

    gnb = GaussianNB()
    y_predict = gnb.fit(x_train, y_train).predict(x_test)
    score = metrics.accuracy_score(y_test, y_predict)
    print('NB Accuracy: {0:f}'.format(score))

    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(x_train)
    classifier = tf.contrib.learn.DNNClassifier(
        feature_columns=feature_columns, hidden_units=[500,10], n_classes=2)

    classifier.fit(x_train, y_train, steps=5000, batch_size=10)
    y_predict=list(classifier.predict(x_test, as_iterable=True))
    score = metrics.accuracy_score(y_test, y_predict)
    print('DNN Accuracy: {0:f}'.format(score))
Example #25
def getScores(y, yPredTrain, yTest, yPredTest):

    scores = dict()

    scores['f1Train'] = f1_score(y, yPredTrain)
    scores['f1Test'] = f1_score(yTest, yPredTest)


    scores['accTrain'] = accuracy_score(y, yPredTrain)
    scores['accTest'] = accuracy_score(yTest, yPredTest)
    

    scores['rocTrain'] = roc_auc_score(y, yPredTrain)
    scores['rocTest'] = roc_auc_score(yTest, yPredTest)
    

    scores['cMatrixTrain'] = confusion_matrix(y, yPredTrain)
    scores['cMatrixTest'] = confusion_matrix(yTest, yPredTest)

    proba = float(len(np.where(y==1)[0]))/len(y)
    if proba < 0.50:
        proba = 1 - proba
    scores['random'] = proba
    
    return scores
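# Usage sketch with made-up binary labels: note that scores['random'] is the
# majority-class rate, i.e. the accuracy a constant classifier would reach.
import numpy as np

y = np.array([0, 0, 1, 1, 1, 0, 1, 0])
yPredTrain = np.array([0, 1, 1, 1, 0, 0, 1, 0])
yTest = np.array([1, 0, 1, 0])
yPredTest = np.array([1, 0, 0, 0])
print(getScores(y, yPredTrain, yTest, yPredTest))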
def main():
    #cleanAndWrite()

    df_gtd_imp = pd.read_pickle('df_gtd_imp.pkl')

    df_test = df_gtd_imp.query("gname == 'Unknown'")
    df_test = df_test.drop('gname',1)

    df_train = df_gtd_imp.query("gname != 'Unknown'")
    df_train_x = pd.DataFrame(df_train)
    df_train_x = df_train_x.drop('gname',1)
    df_train_y = df_train.gname

    print(df_train_x.shape)
    print(df_train_y.shape)
    print(df_test.shape)

    df_train_x, df_test = selectRelFeatures(df_train_x, df_train_y, df_test)

    print('training model')
    X_train, X_test, y_train, y_test = \
            train_test_split(df_train_x, df_train_y, test_size=0.3, random_state=0)

    clf_rf = RandomForestClassifier(n_estimators=5)
    clf_rf.fit(X_train, y_train)
    y_pred = clf_rf.predict(X_test)
    print(accuracy_score(y_test, y_pred))
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring):
    '''
    Run normal SVM classification without cross-fold validation.
    '''

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3) # 30% reserved for validation

    # feature selection since we have a small sample space
    fs = SelectPercentile(scoring, percentile=20)

    pipeline = Pipeline([('featureselector', fs), ('scaler', StandardScaler()), ('estimator', estimator)])

    pipeline = OneVsRestClassifier(pipeline)

    clfer = pipeline.fit(x_train, y_train)
    y_predict_train = clfer.predict(x_train)

    print "%% Accuracy on training set: %2.3f" % metrics.accuracy_score(y_train, y_predict_train)

    y_predict_test = clfer.predict(x_test)
    print "\n%% Accuracy on testing set: %2.3f" % metrics.accuracy_score(y_test, y_predict_test)

    print "\nClassification Report:"
    print metrics.classification_report(y_test, y_predict_test)

    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_predict_test)
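# Usage sketch (the argument choices are assumptions): `scoring` is the
# univariate scoring function handed to SelectPercentile (e.g. f_classif) and
# `estimator` is any sklearn classifier.
from sklearn import datasets
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC

x, y = datasets.load_iris(return_X_y=True)
simple_classification_without_cross_fold_validation(x, y, SVC(kernel='linear'), f_classif)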
	def testNetwork(self,X, Y):
		"""
		Test the neural network
		"""
		YPred = self.nn.predict(X)
		self.testError = 1 - accuracy_score(Y, YPred)
		print(accuracy_score(Y, YPred))
def fitMdl(nFitObs=50):
    mdl = linear_model.LogisticRegression(verbose=1)
    mdl.fit(np.reshape(glbObsTrnFtr[0:nFitObs, :, :],
                       (nFitObs, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])),
            glbObsTrnRsp[0:nFitObs])
    print(mdl.get_params())
    print(mdl.coef_.shape)
    print('  coeff stats:')
    for lblIx in range(len(dspLabels)):
        print('  label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; '
              'maxCoeff:row:%2d, col:%2d, value:%0.4f;' %
              (dspLabels[lblIx],
               mdl.coef_[lblIx, :].argmin() // glbImgSz,
               mdl.coef_[lblIx, :].argmin() % glbImgSz,
               mdl.coef_[lblIx, :].min(),
               mdl.coef_[lblIx, :].argmax() // glbImgSz,
               mdl.coef_[lblIx, :].argmax() % glbImgSz,
               mdl.coef_[lblIx, :].max()))

    train_pred_labels = mdl.predict(np.reshape(glbObsTrnFtr[0:nFitObs, :, :],
                                               (nFitObs, glbImgSz ** 2)))
    accuracy_train = metrics.accuracy_score(glbObsTrnRsp[0:nFitObs], train_pred_labels)
    print('  accuracy train:%0.4f' % accuracy_train)
    print(metrics.confusion_matrix(glbObsTrnRsp[0:nFitObs], train_pred_labels))

    valid_pred_labels = mdl.predict(np.reshape(glbObsVldFtr,
                                               (glbObsVldFtr.shape[0], glbImgSz ** 2)))
    accuracy_valid = metrics.accuracy_score(glbObsVldRsp, valid_pred_labels)
    print('  accuracy valid:%0.4f' % accuracy_valid)
    print(metrics.confusion_matrix(glbObsVldRsp, valid_pred_labels))

    test_pred_labels = mdl.predict(np.reshape(glbObsNewFtr,
                                              (glbObsNewFtr.shape[0], glbImgSz ** 2)))
    accuracy_test = metrics.accuracy_score(glbObsNewRsp, test_pred_labels)
    print('  accuracy  test:%0.4f' % accuracy_test)
    test_conf = pd.DataFrame(metrics.confusion_matrix(glbObsNewRsp, test_pred_labels),
                             index=dspLabels, columns=dspLabels)
    print(test_conf)

    return (mdl, (accuracy_train, accuracy_valid, accuracy_test))
Example #30
def main():
    indata = np.load(inputs)
    training_data = indata['data_training']
    training_scaled = preprocessing.scale(training_data)
    training_labels = indata['label_training']
    validation_data = indata['data_val']
    validation_scaled = preprocessing.scale(validation_data)
    validation_labels = indata['label_val']
    ts = range(-12,6)
    cs = [pow(10, t) for t in ts]
    accuracy_results = []
    accuracy_results_scaled = []

    for c in cs:
        lin_clf = svm.LinearSVC(C=c)
        lin_clf.fit(training_data, training_labels)
        predictions = lin_clf.predict(validation_data)
        accuracy = metrics.accuracy_score(validation_labels, predictions)
        accuracy_results.append(accuracy)

        lin_clf.fit(training_scaled, training_labels)
        predictions = lin_clf.predict(validation_scaled)
        accuracy_scaled = metrics.accuracy_score(validation_labels, predictions)
        accuracy_results_scaled.append(accuracy_scaled)

    plt.plot(range(len(cs)), accuracy_results, label='un-scaled')
    plt.plot(range(len(cs)), accuracy_results_scaled, label='scaled')
    plt.xticks(range(len(cs)), cs, size='small')
    plt.legend()
    plt.show()
    print(accuracy_results)
    print(accuracy_results_scaled)
print("Model 2 prediction started")

data_train = pd.read_csv('model2.csv')  # read file generated by preprocess

missing_vals= data_train.isnull().sum().sort_values(ascending=False)
print(missing_vals)
print(data_train.shape[0])

print(data_train['failure'].value_counts())

Y = data_train['failure']                  # remove cols
serial=data_train['serial_number']
data_train.drop('failure', axis=1, inplace=True)
data_train.drop('serial_number', axis=1, inplace=True)

from imblearn.over_sampling import SMOTE   # Apply SMOTE
smt = SMOTE()
data_train, Y = smt.fit_resample(data_train, Y)

# Apply train/test split to partition the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_train, Y, test_size=0.5, random_state=3)

from sklearn import tree        #use decision tree classifier 
clf = tree.DecisionTreeClassifier()
clf.fit(x_train,y_train)

y_pred=clf.predict(x_test)    #predicted values
from sklearn.metrics import confusion_matrix,accuracy_score
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))    #get Accuracy
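# Note (not part of the original script): oversampling before the split lets
# synthetic neighbors of test rows leak into training. A common variant splits
# first and applies SMOTE to the training partition only. A minimal sketch,
# assuming data_train and Y are the un-resampled frames from above:
x_train, x_test, y_train, y_test = train_test_split(data_train, Y, test_size=0.5, random_state=3)
x_train, y_train = SMOTE().fit_resample(x_train, y_train)
clf = tree.DecisionTreeClassifier().fit(x_train, y_train)
print(accuracy_score(y_test, clf.predict(x_test)))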
Example #32
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=0.2, random_state=seed)

results = []
names = []

models = [('LR', LogisticRegression()), ('LDA', LinearDiscriminantAnalysis()),
          ('KNN', KNeighborsClassifier()), ('CART', DecisionTreeClassifier()),
          ('SVM', SVC())]

for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model,
                                 X_train,
                                 Y_train,
                                 cv=kfold,
                                 scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    final_results = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(final_results)

knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)

print(" ")
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
Example #33
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
le = preprocessing.LabelEncoder()
data = pd.read_csv("mining.csv")
print(data.head())
for c in data.columns.values:
    if data[c].dtypes == 'object':
        le.fit(data[c].values)
        data[c] = le.transform(data[c])
data = data.to_numpy()
X = data[:, 1:10]
y = data[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
yp = knn.predict(X_test)
print(accuracy_score(y_test, yp))
print(confusion_matrix(y_test, yp))
Example #34
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import pandas as pd

#read the csv files for both training and testing
ftrain = pd.read_csv('C:/Users/user/Documents/hitachi pentaho/train.csv')
ltrain = pd.read_csv('C:/Users/user/Documents/hitachi pentaho/trainVar.csv')
ftest = pd.read_csv('C:/Users/user/Documents/hitachi pentaho/test.csv')
ltest = pd.read_csv('C:/Users/user/Documents/hitachi pentaho/testVar.csv')

#creating data frames for training purposes
#x will take all the features and y will take the labels
x = ftrain.values.tolist()
y = list(ltrain['molecules'])

#creating data frames for testing purposes
#p will take all the features and q will be the expected classification
p = ftest.values.tolist()
q = list(ltest['molecules'])

clf = GaussianNB()
clf.fit(x, y)
output = clf.predict(p)
print(output)
print(accuracy_score(q, output))


Example #35
baseline_acc = round(Y_test[Y_test == 0].shape[0] / Y_test.shape[0], 3)
print(f'The baseline accuracy of a naive model is {baseline_acc}')

# #### Random Forest Model

# I also build Random Forests in order to predict if a datapoint is a pick-up location or not. RF outperforms the baseline approach significantly.
#
# Random forest, as its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction, and the class with the most votes becomes our model's prediction.

# In[62]:

clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=0)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
acc = round(accuracy_score(Y_test, Y_pred), 3)
print(f'The accuracy is {acc}')

# ##### Feature Importance

# We can also inspect and interpret the trained Random Forest classifier by analyzing the importance of each feature. Coordinates are the most important features to classify a pick-up point, whereas the day feature does not help the classifier.

# In[63]:

importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(len(importances)), importances[indices], yerr=std[indices], align="center")
plt.xticks(range(len(importances)), indices)
plt.show()
Example #36
del csv[0]
# Shuffle the data
random.shuffle(csv)
# Split into training and test data (2:1 ratio)
total_len = len(csv)
train_len = int(total_len * 2 / 3)
train_data = []
train_label = []
test_data = []
test_label = []

for i in range(total_len) :
    data = csv[i][0:4]
    label = csv[i][4]
    if i < train_len :
        train_data.append(data)
        train_label.append(label)
    else :
        test_data.append(data)
        test_label.append(label)
# Train on the data and predict
clf = svm.SVC(gamma='auto')
clf.fit(train_data, train_label)
pre = clf.predict(test_data)
# Compute the accuracy
ac_score = metrics.accuracy_score(test_label, pre)

print("전체 데이터 수: %d" %total_len)
print("학습 전용 데이터 수: %d" %train_len)
print("테스트 데이터 수: %d" %(len(test_data)))
print("정답률 =", ac_score)
Example #37
    def _test(metric_device):
        metric_device = torch.device(metric_device)
        acc = Accuracy(is_multilabel=True, device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        acc.update((y_pred, y))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        y = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        acc.update((y_pred, y))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
        # check that result is not changed
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        # Batched Updates
        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()

        batch_size = 16
        n_iters = y.shape[0] // batch_size + 1

        for i in range(n_iters):
            idx = i * batch_size
            acc.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(y_pred.cpu())  # (N, C, L, ...) -> (N * L * ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, L, ...) -> (N * L ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
Example #38
# One-hot encode the sample features
MONTH_t = pd.to_datetime(train['EVENT_DATE']).dt.month
MONTH_t = pd.get_dummies(MONTH_t)
number_t = pd.get_dummies(test['事件数'])
SHENFEN_t = pd.get_dummies(test['ADMIN1'])
test_set = pd.concat([MONTH_t, number_t, SHENFEN_t], axis=1)

x = train_set.loc[:,train_set.columns!='crime_type']
y = train_set['crime_type']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=1)

model = BernoulliNB()  # Bernoulli naive Bayes
model.fit(x_train, y_train)
y_pred = model.predict(x_test)  # compare predictions against actual values and print the accuracy
print("BernoulliNB model accuracy: ", metrics.accuracy_score(y_test, y_pred))
model_LR = LogisticRegression(C=0.1)  # logistic regression
model_LR.fit(x_train, y_train)
y_pred = model_LR.predict(x_test)
print("Logistic regression model accuracy: ", metrics.accuracy_score(y_test, y_pred))
model_RF = RandomForestClassifier()  # random forest
model_RF.fit(x_train, y_train)
y_pred = model_RF.predict(x_test)
print("Random forest model accuracy: ", metrics.accuracy_score(y_test, y_pred))






# image data machine learning
from sklearn import model_selection, svm, metrics
# Read and process the CSV file (convert each pixel of the image data to a float vector)
def load_csv(fname):
    labels = []
    images = []
    with open(fname, "r") as f:
        for line in f:
            cols = line.split(",")
            if len(cols) < 2: continue
            labels.append(int(cols.pop(0)))
            vals = list(map(lambda n: int(n) / 256, cols))
            images.append(vals)
    return {"labels":labels, "images":images}

data = load_csv("./mnist/train.csv")
test = load_csv("./mnist/t10k.csv")

clf = svm.SVC()
clf.fit(data["images"], data["labels"])
predict = clf.predict(test["images"])
# Check the results
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("정답률 =", ac_score)
print("리포트 =")
print(cl_report)
    clf = GridSearchCV(RandomForestClassifier(),
                       tuned_parameters,
                       cv=5,
                       scoring='%s' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print("Detailed confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("Accuracy Score: \n")
    print(accuracy_score(y_true, y_pred))

    print()
Example #41
X_train, X_test, y_train, y_test = train_test_split( X_data, Y_data, \
                                                    test_size = 0.3, \
                                                    train_size = 0.7, \
                                                    random_state = 100)

# Estimating the tree model - such as the ensemble comparison
RFC_Model = RandomForestClassifier(max_depth=20, random_state=100,\
                                   n_estimators=15)
RFC_Model.fit(X_train, y_train.ravel())
# print(RFC_Model.feature_importances_)

# Creating the model for automatic web api
pk.dump(RFC_Model, open('model.pkl', 'wb'))

# Predicting the output data
y_RFC_pred = RFC_Model.predict(X_test)

# Performance measure
print(accuracy_score(y_test, y_RFC_pred) * 100)

# Another one
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_RFC_pred)
acc = 100 * (cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])

print(acc)

# Predicting all data classification
y_RFC_pred_train = RFC_Model.predict(X_train)
print(accuracy_score(y_train, y_RFC_pred_train) * 100)
#import pandas
import pandas as pd
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# load dataset
pima = pd.read_csv("diabetess.txt", header=None, names=col_names)
print(pima.head())
#split dataset in features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols] # Features
y = pima.label # Target variable
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
from sklearn.linear_model import LogisticRegression

# instantiate the model (liblinear solver, otherwise default parameters)
logreg = LogisticRegression(C=1.0, random_state=45, solver='liblinear', max_iter=100)

# fit the model with data
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
def run():
    #reading the dataset
    dfx= pd.read_csv(config.TRAINING_FILE).fillna("none")

    dfx.sentiment= dfx.sentiment.apply(
        lambda x:1 if x=="positive" else 0
    )

    

    #splitting into training and validation set
    df_train,df_valid= model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify= dfx.sentiment.values
    )

    df_train= df_train.reset_index(drop=True)
    df_valid= df_valid.reset_index(drop=True)

    train_dataset=dataset.BERTDataset(
        review=df_train.review.values,
        target=df_train.sentiment.values

    )

    train_data_loader= torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )


    valid_dataset = dataset.BERTDataset(
        review=df_valid.review.values,
        target=df_valid.sentiment.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    cuda= torch.cuda.is_available()

    if cuda:
        device= torch.device("cuda")

    else:
        device= torch.device("cpu")

    model= BERTBaseUncased()
    model.to(device)


    param_optimizer=list(model.named_parameters())
    no_decay=["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters=[
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]


    # print("Printing optimizer parameters......******")
    # print(optimizer_parameters)
    # print("Printing optimizer parameters......******")

    num_train_steps= int(len(df_train)/config.TRAIN_BATCH_SIZE*config.EPOCHS)

    optimizer=AdamW(optimizer_parameters,lr=3e-5)

    scheduler=get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    best_accuracy=0

    for epochs in range(config.EPOCHS):

        #calling the training function in engine.py file
        engine.train_fn(train_data_loader,model,optimizer,device,scheduler)
        
        #calling the evaluation function from the engine.py file to compute evaluation
        outputs,targets=engine.eval_fn(valid_data_loader,model,device)

        outputs=np.array(outputs)>=0.5

        #calculating the accuracy after every epoch
        accuracy=metrics.accuracy_score(targets,outputs)
        print(f"Accuracy Score = {accuracy}")
        
        #updating the accuracy
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
import numpy as np
import pandas as pd
from sklearn import datasets
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")
X = data.iloc[:, :-1].values
y = data.iloc[:, 4].values

from sklearn.model_selection import train_test_split
# Splitting the data for training(70%) and testing(30%)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print("The training data is\n", X_train)
print("The testing data is\n", X_test)
print("The expected result is\n", y_test)
# By using LogisticRegression

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
# Training the dataset using Logistic Regression Model
clf.fit(X_train, y_train)
# Prediction

prediction = clf.predict(X_test)
print("The prediction by the machine is\n", prediction)
from sklearn.metrics import accuracy_score
# Finding the accuracy of the model

a = accuracy_score(y_test, prediction)
print("The accuracy of the model is:", a)
Example #45
    def forward(self, inputs, mask=None):
        output = self.gcn_1(inputs, mask=mask)
        output = self.dropout(output)
        output = self.gcn_2([output] + inputs[1:], mask=mask)
        return output


if __name__ == "__main__":
    model = GraphClassifier(A[0].shape[0], HIDDEN, output_dimension, BASES, DO,
                            len(A))
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=L2)
    criterion = nn.CrossEntropyLoss()
    X = sparse.csr_matrix(A[0].shape).todense()
    for epoch in range(NB_EPOCH):
        t = time.time()
        output = model([X] + A)
        gold = y_train[idx_train].argmax(dim=-1)
        loss = criterion(output[idx_train], gold)
        score = accuracy_score(gold.cpu(), output[idx_train].argmax(dim=-1).cpu())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print("train_accuracy:", score, "loss:,", loss.item(), "time:",
              time.time() - t)
        test_gold = y_test[idx_test].argmax(dim=-1)
        test_output = output[idx_test]
        test_score = accuracy_score(test_gold.cpu(), test_output.argmax(dim=-1).cpu())
        test_loss = criterion(test_output, test_gold)
        print("test_accuracy:", test_score, "loss:", test_loss.item())
		Change_W,Change_B = BackPropagation(MATY,MATY_o,Netj,Oj,WEIGHTS)

		# print(len(WEIGHTS))
		# print(len(Change_W))
		for i in range(0,len(WEIGHTS)):
			WEIGHTS[i] -= alpha*Change_W[len(WEIGHTS)-i-1]
			BIAS[i] -= alpha*Change_B[len(WEIGHTS)-i-1]
		# print(batch)

		# break;
	print(epoch)


end3 = time.time()

y_pred = prediction(MATX1,WEIGHTS,BIAS)


accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
cm = confusion_matrix(y_test, y_pred)  # avoid shadowing the confusion_matrix function
print(cm)
end4 = time.time()


print("reading time" , end2-start)
print("training time" , end3-end2)
print("testing time" , end4-end3)


# In[ ]:


for clf in classifiers:
    print("="*30)
    
    clf_name = clf.__class__.__name__
    print(clf_name)

    clf.fit(x_train, y_train)
    
    #Training Accuracy
    y_train_pred = clf.predict(x_train)
    train_acc = accuracy_score(y_train, y_train_pred)
    
    #Validation Accuracy
    y_valid_pred = clf.predict(x_valid)
    valid_acc = accuracy_score(y_valid, y_valid_pred)
    
    print("Validation Accuracy: {:.4%}".format(valid_acc))
    
    log_entry = pd.DataFrame([[clf_name, train_acc, valid_acc]], columns=log_cols)
    log = pd.concat([log, log_entry], ignore_index=True)


# In[ ]:


log.sort_values('Validation Accuracy', ascending=True).plot.barh(x='Classifier', y='Validation Accuracy', figsize=(16,7))
  
print(X.head(n=20))

print(Y.head())



# Split the 'features' and 'income' data into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y,test_size = 0.2,random_state = 0)


clf = SVC()
clf.fit(X_train1, y_train1)
predicted = clf.predict(X_test1)

print("Accuracy = {}\nPrecision = {}\nRecall = {}\nF1 Score = {}".format(metrics.accuracy_score(y_test1, predicted), metrics.precision_score(y_test1, predicted),metrics.recall_score(y_test1, predicted),metrics.f1_score(y_test1, predicted)))

score_p.append([metrics.accuracy_score(y_test1, predicted), metrics.precision_score(y_test1, predicted),metrics.recall_score(y_test1, predicted),metrics.f1_score(y_test1, predicted)]) 

print(confusion_matrix(y_test1, predicted))


neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X_train1, y_train1) 
predicted = neigh.predict(X_test1)
print("Accuracy = {}\nPrecision = {}\nRecall = {}\nF1 Score = {}".format(metrics.accuracy_score(y_test1, predicted), metrics.precision_score(y_test1, predicted),metrics.recall_score(y_test1, predicted),metrics.f1_score(y_test1, predicted)))

score_p.append([metrics.accuracy_score(y_test1, predicted), metrics.precision_score(y_test1, predicted),metrics.recall_score(y_test1, predicted),metrics.f1_score(y_test1, predicted)]) 

print(confusion_matrix(y_test1, predicted))
# Importing the required libraries.
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import classification_report


# In[106]:


# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
predmnb = mnb.predict(x_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test, predmnb))
print("Score:", round(accuracy_score(y_test, predmnb) * 100, 2))
print("Classification Report:")
print(classification_report(y_test, predmnb))


# **The Multinomial Naive Bayes classifier scores 86.06% accuracy. Since that is a high score, I will treat this model as my baseline.**

# # 5.4.2 Random Forest Classifier

# Our feature (the review text) has no simple linear relationship with the target (review_stars), which is one reason to try a Random Forest Classifier.
# The vital thing for a Random Forest Classifier to make accurate class predictions is that the trees of the forest, and more importantly their predictions, are uncorrelated (or at least weakly correlated) with each other.
#
# Random forests are an ensemble learning method for classification: they construct a multitude of decision trees at training time and output the class that is the mode of the individual trees' predicted classes (or, for regression, the mean of their predictions). A sketch of such a classifier on this data follows in the next cell.

# In[107]:
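
# A hedged sketch of the Random Forest cell (the hyperparameters here are illustrative, not the author's; x_train/x_test are the same vectorized reviews used for the Naive Bayes baseline above):

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
predrf = rf.predict(x_test)
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, predrf))
print("Score:", round(accuracy_score(y_test, predrf) * 100, 2))
print("Classification Report:")
print(classification_report(y_test, predrf))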

Beispiel #50
0
def main():
    global start_dens
    st.title("Multi Variate Contamination in Fuel")

    menu = ["Home", "Model Training", "Contaminant Prediction"]
    choice = st.sidebar.selectbox("Menu", menu)

    if choice == "Home":
        st.subheader("Home")
        image_file = st.file_uploader("Upload Image",
                                      type=['png', 'jpeg', 'jpg'])
        if image_file is not None:
            # To See Details
            # st.write(type(image_file))
            # st.write(dir(image_file))
            file_details = {
                "Filename": image_file.name,
                "FileType": image_file.type,
                "FileSize": image_file.size
            }
            st.write(file_details)

            img = load_image(image_file)
            st.image(img, width=250)

    if choice == "Model Training":
        if st.button("Start Train"):
            allResults = glob.glob(
                'batch31-38_with_target_diff_prx_2000tampered/*.csv',
                recursive=True)
            allResults = sorted(allResults, key=lambda x: (x.split("/")[-1]))
            #st.write(allResults)

            newpath1 = 'batch31-38_with_target_diff_prx_2000tampered/'
            # newpath1='/content/drive/MyDrive/OIL SAMPLES DATA1/'

            folder = newpath1  ## data directory
            tag = str('.csv')  ## format to import
            initString = '-'  ## string in csv file name to search for category (normal, sludge, water, together)
            fileList = directorySearch(folder, tag)
            # print(fileList)
            final_filelist = pd.DataFrame(
                index=range(0, len(fileList)),
                columns=['file', 'Target', 'file_dir', 'window_id'])
            for i in range(0, (len(fileList))):
                fileName = fileList[i]
                res1 = fileName.find(initString)
                if res1 == -1:
                    res1 = fileName.find('_')
                if res1 == -1:
                    print(res1)
                    res1 = 5
                c1 = int(res1 + 1)
                c5 = int(res1 + 12)

                wloc = fileName.rfind('W', c1, c5)
                sloc = fileName.rfind('S', c1, c5)
                tloc = fileName.rfind('T', c1, c5)
                finalCat = max([wloc, sloc, tloc])
                strCat = fileName[finalCat]
                # print(strCat)

                # Record the file and its category character; also map the
                # character to an integer class label (kept from the original
                # script, though the model below learns from the encoded
                # 'Target' column instead).
                classLabel = 0
                if strCat == 'S':      # sludge
                    classLabel = 1
                elif strCat == 'W':    # water
                    classLabel = 2
                elif strCat == 'T':    # sludge + water mix
                    classLabel = 3
                final_filelist.loc[i, 'file'] = fileName
                final_filelist.loc[i, 'Target'] = strCat
                final_filelist.loc[i, 'file_dir'] = allResults[i]
                final_filelist.loc[i, 'window_id'] = i + 1

            ll = []
            for i, j in enumerate(final_filelist['file']):
                # print(i,j)
                head, tail = os.path.split(j)
                r1 = re.split('_', tail)
                r2 = re.split('-', r1[0])
                print(r2)
                # keep files whose batch suffix does not contain 'A'
                if len(r2) in (2, 3, 4) and 'A' not in r2[-1]:
                    ll.append(tail)

            dff = pd.DataFrame({'file': ll})
            dff['file'].count()

            df4 = pd.DataFrame()
            c = 0
            # for i,j in enumerate(allResults):
            for i, j in enumerate(dff['file']):
                # print(i,j)
                df = pd.read_csv(
                    'batch31-38_with_target_diff_prx_2000tampered/' + j)
                head, tail = os.path.split(j)
                # print(i,df.shape[1])
                df4[tail] = (df['Pressure_tmp'].rolling(300).std())

            df9 = pd.DataFrame(index=range(0, len(df4.columns)),
                               columns=[
                                   'file', 'pre-trans_mean', 'trans_mean',
                                   'post-trans_mean', 'transient_width'
                               ])

            for z, col in enumerate(df4.columns):
                start = 0
                end = 0

                a = df4[col]
                b = a.quantile(0.7)  # threshold: the 70th percentile

                # find the first position whose value exceeds the threshold
                for i, j in enumerate(a):
                    if j > b:
                        start = i
                        break
                # from there, find the first value back below the threshold,
                # at least 200 samples later (to capture the full transient)
                for k, l in enumerate(a.iloc[start:]):
                    if l < b and k > 200:
                        end = start + k
                        break
                df9.loc[z, 'file'] = col
                df9.loc[z, 'pre-trans_mean'] = df4[col].iloc[:start].mean()
                df9.loc[z, 'trans_mean'] = df4[col].iloc[start:end].mean()
                df9.loc[z, 'post-trans_mean'] = df4[col].iloc[end:].mean()
                df9.loc[z, 'transient_width'] = max(end - start, 0)

            df5 = df4.describe().transpose()
            df5 = df5.reset_index()
            df10 = pd.merge(df9,
                            df5[['index', 'std', 'max']],
                            left_on='file',
                            right_on='index',
                            how='left')
            del df10['index']
            df10 = df10.set_index('file')

            df11 = pd.merge(df10,
                            final_filelist[['file', 'Target']],
                            left_on='file',
                            right_on='file',
                            how='left')
            df11 = df11.set_index('file')
            df11 = df11.astype({
                'pre-trans_mean': 'float64',
                'trans_mean': 'float64',
                'post-trans_mean': 'float64',
                'transient_width': 'float64'
            })

            df12 = pd.DataFrame()
            for i, j in enumerate(dff['file']):
                # print(i,j)
                df = pd.read_csv(
                    'batch31-38_with_target_diff_prx_2000tampered/' + j)
                head, tail = os.path.split(j)
                # print(i,df.shape[1])
                df12[tail] = (df['Density'].rolling(300).std())

            df13 = pd.DataFrame(index=range(0, len(df12.columns)),
                                columns=[
                                    'file', 'pre-trans_mean-density',
                                    'trans_mean-density',
                                    'post-trans_mean-density',
                                    'transient_width-density'
                                ])

            for z, col in enumerate(df12.columns):
                start = 0
                end = 0
                print(col)  # file name

                a = df12[col]
                b = a.quantile(0.7)  # threshold: the 70th percentile

                # find the first position whose value exceeds the threshold
                for i, j in enumerate(a):
                    if j > b:
                        start = i
                        break
                # from there, find the first value back below the threshold,
                # at least 200 samples later (to capture the full transient)
                for k, l in enumerate(a.iloc[start:]):
                    if l < b and k > 200:
                        end = start + k
                        break
                df13.loc[z, 'file'] = col
                df13.loc[z, 'pre-trans_mean-density'] = df12[col].iloc[:start].mean()
                df13.loc[z, 'trans_mean-density'] = df12[col].iloc[start:end].mean()
                df13.loc[z, 'post-trans_mean-density'] = df12[col].iloc[end:].mean()
                df13.loc[z, 'transient_width-density'] = max(end - start, 0)
            df13 = df13.astype({
                'pre-trans_mean-density': 'float64',
                'trans_mean-density': 'float64',
                'post-trans_mean-density': 'float64',
                'transient_width-density': 'float64'
            })
            df11.drop(['std'], axis=1, inplace=True)

            df14 = df13[[
                'file', 'pre-trans_mean-density', 'post-trans_mean-density'
            ]].copy()  # copy to avoid SettingWithCopyWarning
            df14['pre-trans_mean-density'] = df14[
                'pre-trans_mean-density'].fillna(0)

            df11.dropna(inplace=True)

            le = preprocessing.LabelEncoder()
            df11['Target'] = le.fit_transform(df11['Target'])
            df11.loc[:, 'Target']

            df15 = df11.merge(df14, how='inner', on='file')
            del df15['file']
            df15 = df15[[
                'pre-trans_mean', 'trans_mean', 'post-trans_mean',
                'transient_width', 'max', 'pre-trans_mean-density',
                'post-trans_mean-density', 'Target'
            ]]
            st.write(df15)
            col = df15.columns

            features = col.tolist()
            feature = features[:-1]
            target = features[-1]

            # x=dff_tr.loc[:,feature].values
            # y=dff_tr.loc[:,target].values
            x = df15.loc[:, feature].values
            y = df15.loc[:, target].values
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.3, random_state=98)
            ost = SMOTE()
            # oversample the minority classes in the training split only
            os_data_X, os_data_y = ost.fit_resample(x_train, y_train)
            os_data_X = pd.DataFrame(data=os_data_X, columns=feature)
            os_data_y = pd.DataFrame(data=os_data_y, columns=['Target'])

            clf_rf_bal = RandomForestClassifier(n_estimators=10,
                                                random_state=99)
            clf_rf_bal = clf_rf_bal.fit(os_data_X, os_data_y.values.ravel())

            # Optional (commented out in the original): feature importances,
            # either via permutation importance or the forest's built-in scores.
            # from sklearn.inspection import permutation_importance
            # results = permutation_importance(clf_rf_bal, x, y, scoring='accuracy')
            # for i, v in enumerate(results.importances_mean):
            #     print('Feature: %0d, Score: %.5f' % (i, v))
            # for i, v in enumerate(clf_rf_bal.feature_importances_):
            #     print('Feature: %0d, Score: %.5f' % (i, v))

            y_pred_bal = clf_rf_bal.predict(x_test)
            bal_cm = confusion_matrix(y_test, y_pred_bal)

            print('balanced classification report')
            cls_rpt = classification_report(y_test, y_pred_bal)
            st.write(f'classification report : {cls_rpt}')

            bal_ac = accuracy_score(y_test, y_pred_bal)
            st.write(f'accuracy score : {bal_ac}')

            filename = 'finalized_model1.pkl'
            pickle.dump(clf_rf_bal,
                        open(os.path.join(os.getcwd(), filename), 'wb'))

    if choice == "Contaminant Prediction":
        st.subheader("Dataset")
        data_file = st.file_uploader("Upload CSV", type=['csv'])
        if st.button("Process") and data_file is not None:
            file_details = {
                "Filename": data_file.name,
                "FileType": data_file.type,
                "FileSize": data_file.size
            }
            st.write(file_details)

            df = pd.read_csv(data_file)
            st.dataframe(df)

            tag = str('.csv')  ## format to import
            initString = '-'  ## string in csv file name to search for category (normal, sludge, water, together)
            fileName = data_file.name
            # print(fileList)
            final_filelist = pd.DataFrame(columns=['file', 'Target'])
            res1 = fileName.find(initString)
            if res1 == -1:
                res1 = fileName.find('_')
            if res1 == -1:
                print(res1)
                res1 = 5
            c1 = int(res1 + 1)
            c5 = int(res1 + 12)

            wloc = fileName.rfind('W', c1, c5)
            sloc = fileName.rfind('S', c1, c5)
            tloc = fileName.rfind('T', c1, c5)
            finalCat = max([wloc, sloc, tloc])
            strCat = fileName[finalCat]

            st.write(f'FileName:{fileName}')
            if strCat not in ['S', 'T', 'W']:
                strCat = 'No Contaminant'
                st.write('No Contaminant')
            if strCat in ['S', 'T', 'W']:
                st.write('Contaminant Exists')
            if strCat == 'S':
                st.write('Type of Contaminant: Sludge')
            if strCat == 'W':
                st.write('Type of Contaminant: Water')
            if strCat == 'T':
                st.write('Type of Contaminant: Sludge+Water')

            df4 = pd.DataFrame()
            df4['roll_std'] = df['Pressure_tmp'].rolling(300).std()
            df5 = df4.describe().transpose()
            df5 = df5.reset_index()
            maxx = df5['max'][0]
            df9 = pd.DataFrame(columns=[
                'file', 'pre_trans_mean', 'trans_mean', 'post_trans_mean',
                'transient_width'
            ])

            a = df4['roll_std']
            st.write(a)

            b = a.quantile(0.7)  # threshold: the 70th percentile
            st.write(b)
            x = df4['roll_std'] > b  # mask of values greater than the threshold
            st.write(x)

            start = 0
            end = 0
            # find the first position whose value exceeds the threshold
            for i, j in enumerate(a):
                if j > b:
                    start = i
                    break
            # from there, find the first value back below the threshold,
            # at least 200 samples later (to capture the full transient)
            for k, l in enumerate(a.iloc[start:]):
                if l < b and k > 200:
                    end = start + k
                    break

            file = data_file.name
            pre_trans_mean = df4['roll_std'].iloc[:start].mean()
            trans_mean = df4['roll_std'].iloc[start:end].mean()
            post_trans_mean = df4['roll_std'].iloc[end:].mean()
            transient_width = max(end - start, 0)

            df12 = pd.DataFrame()
            df12['roll_std_den'] = (df['Density'].rolling(300).std())

            # (df13 mirrors the training-side feature frame; it is not used
            # further in this branch but is kept from the original script)
            df13 = pd.DataFrame(columns=[
                'file', 'pre_trans_mean_dens', 'trans_mean_dens',
                'post_trans_mean_dens', 'transient_width_dens'
            ])

            p = df12['roll_std_den']
            q = p.quantile(0.7)  # threshold: the 70th percentile
            xx = df12['roll_std_den'] > q  # mask of values greater than the threshold

            start_dens = 0
            end_dens = 0
            # find the first position whose value exceeds the threshold
            for i, j in enumerate(p):
                if j > q:
                    start_dens = i
                    break
            # from there, find the first value back below the threshold,
            # at least 200 samples later
            for k, l in enumerate(p.iloc[start_dens:]):
                if l < q and k > 200:
                    end_dens = start_dens + k
                    break

            pre_trans_mean_dens = df12['roll_std_den'].iloc[:start_dens].mean()
            trans_mean_dens = df12['roll_std_den'].iloc[
                start_dens:end_dens].mean()
            post_trans_mean_dens = df12['roll_std_den'].iloc[end_dens:].mean()
            transient_width_dens = max(end_dens - start_dens, 0)

            zz = {
                'file': file,
                'pre_trans_mean': pre_trans_mean,
                'trans_mean': trans_mean,
                'post_trans_mean': post_trans_mean,
                'pre_trans_mean_dens': pre_trans_mean_dens,
                'trans_mean_dens': trans_mean_dens,
                'post_trans_mean_dens': post_trans_mean_dens
            }

            st.write(zz)

            # load the trained model from disk and predict on the engineered
            # features, in the same column order used during training
            loaded_model = pickle.load(open('finalized_model1.pkl', 'rb'))
            result = loaded_model.predict([[
                pre_trans_mean, trans_mean, post_trans_mean,
                transient_width, maxx, pre_trans_mean_dens,
                post_trans_mean_dens
            ]])

            if result == 0:
                st.write(f'Predicted Contaminant: Sludge')
            if result == 1:
                st.write(f'Predicted Contaminant: Water')
            if result == 2:
                st.write(f'Predicted Contaminant: Water+Sludge')
            if result == 3:
                st.write('No Contaminant')
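
The same transient search (rolling standard deviation, 70th-percentile threshold, then a start/end scan with a 200-sample minimum width) is written out four times above. A hedged helper sketch that factors it out; the function name and signature are mine, not the app's:

import pandas as pd

def find_transient(series, quantile=0.7, min_width=200):
    """Return (start, end) positions of the first span where `series` rises
    above its `quantile` threshold, counting an end only after `min_width`
    samples; (0, 0) if no such span is found."""
    threshold = series.quantile(quantile)
    start = end = 0
    # first position whose value exceeds the threshold
    for i, value in enumerate(series):
        if value > threshold:
            start = i
            break
    # first value back below the threshold, at least min_width samples later
    for k, value in enumerate(series.iloc[start:]):
        if value < threshold and k > min_width:
            end = start + k
            break
    return start, end

Usage would mirror the loops above, e.g. start, end = find_transient(df['Pressure_tmp'].rolling(300).std()).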
Beispiel #51
0
def scorer_01loss(estimator, X, y):
    y_pred = estimator.predict(X)
    return 1 - accuracy_score(y, y_pred)
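
Beispiel #51's scorer follows scikit-learn's scorer(estimator, X, y) convention, so it can be passed straight to the model-selection helpers. A hedged usage sketch (note that sklearn assumes higher-is-better, so a loss like this should not drive a hyper-parameter search as-is):

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
losses = cross_val_score(DecisionTreeClassifier(), X_demo, y_demo,
                         scoring=scorer_01loss, cv=5)
print("mean 0/1 loss across folds:", losses.mean())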
Beispiel #52
0
# Imports and data setup inferred from the aliases used below (numpy as n,
# pandas as p, sklearn.metrics as so); x is the iris feature DataFrame.
import numpy as n
import pandas as p
import matplotlib.pyplot as plt
from sklearn import metrics as so
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

iris = load_iris()
x = p.DataFrame(iris.data)
print(x)
x.columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
colormap = n.array(['red', 'lime', 'black'])  # one plotting colour per class
y = p.DataFrame(iris.target)
print(y)
y.columns = ['Target']
plt.figure(figsize=(14, 7))
plt.scatter(x.sepallength, x.sepalwidth, c=colormap[y.Target], s=40)
plt.title("Sepal data before model")
plt.show()
plt.figure(figsize=(14, 7))
plt.scatter(x.petallength, x.petalwidth, c=colormap[y.Target])
plt.title("petal data before model")
plt.show()
model = KMeans(n_clusters=3)  # iris has three species; the original used 2, but the [1, 0, 2] remapping below assumes 3 clusters
model.fit(x)
centroids = model.cluster_centers_  # cluster centroids
print("centroids", centroids)
labels = model.labels_  # cluster assignment for each sample
print(labels)
plt.figure(figsize=(14, 7))
plt.scatter(x.petallength, x.petalwidth, c=colormap[labels])
plt.title("petal data after model")
plt.show()

# remap the arbitrary cluster ids onto the true class ids before scoring
pred_y = n.choose(labels, [1, 0, 2]).astype(n.int64)
#print(labels)
print(pred_y)
print(so.accuracy_score(y, pred_y))
print(so.confusion_matrix(y, pred_y))
"""confusion matrix"""

results_for_conf_matrix = model.predict(x_test)
results_for_conf_matrix

results = []
for i in range(len(results_for_conf_matrix)):
  results.append(list(results_for_conf_matrix[i]).index(max(results_for_conf_matrix[i])))
cm=confusion_matrix(y_test, results)
cm
class_names=['Cloudy' ,'Rain' ,'Sun_shine','Sunrise']

"""accuracy: 89.7"""

from sklearn.metrics import accuracy_score
accuracy_score(y_test, results)

"""confusion_matrix_without_normalisation"""

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute the normalized matrix if requested, then draw it.
    # (Hedged reconstruction of the truncated cell, following the standard
    # scikit-learn documentation example; assumes numpy is imported as np.)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
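
A hedged usage sketch with the cm and class_names computed above:

plot_confusion_matrix(cm, class_names, title='Weather classes, without normalization')
plt.show()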
Beispiel #54
0
def judge_model(x_train, x_test, y_train, y_test, model):
    print('-' * 20)
    print('Baseline Performance')
    print('-> Train Acc:', accuracy_score(y_train, model.predict(x_train)))
    print('-> Test Acc:', accuracy_score(y_test, model.predict(x_test)))
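
A hedged usage sketch for judge_model on a stand-in dataset (the breast-cancer data and decision tree here are illustrations; a deliberately overfit tree makes the train/test gap visible):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

Xb, yb = load_breast_cancer(return_X_y=True)
xb_train, xb_test, yb_train, yb_test = train_test_split(Xb, yb, random_state=0)
fitted = DecisionTreeClassifier(random_state=0).fit(xb_train, yb_train)
judge_model(xb_train, xb_test, yb_train, yb_test, fitted)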
Beispiel #55
0
joblib.dump(classifier, 'knn_model.sav')

# some time later...

# load the model from disk
loaded_model = joblib.load('knn_model.sav')
print('Accuracy of loaded model')
result = loaded_model.score(X_test, y_test)
print(result)

from sklearn.metrics import classification_report, confusion_matrix
y_pred = loaded_model.predict(X_test)  # predictions for the reports below
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(y_pred)
from sklearn.metrics import accuracy_score
print('Accuracy Score: ', accuracy_score(y_test, y_pred))

from sklearn.metrics import f1_score
f1_metric = f1_score(y_test, y_pred, average='macro')
print("f1 score macro:", f1_metric)

f1_metric_micro = f1_score(y_test, y_pred, average='micro')
print("f1 score micro:", f1_metric_micro)

#take input from the loaded model
input_sepal_length = float(input("Enter sepal length: "))
input_sepal_width = float(input("Enter sepal width:"))
input_petal_length = float(input("Enter petal Length: "))
input_petal_width = float(input("Enter petal width: "))
# (the original cell is truncated here; completed from the four inputs above)
output = loaded_model.predict([[input_sepal_length, input_sepal_width,
                                input_petal_length, input_petal_width]])
print("Predicted class:", output[0])
Beispiel #56
0
def scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return accuracy_score(y, y_pred)
Beispiel #57
0
def get_accu(model, data, y_gt):
    # threshold the positive-class probability at 0.5 to get hard labels
    y_pred = model.predict_proba(data)[:, 1]
    y_class = (y_pred > 0.5)
    accu = accuracy_score(y_gt, y_class)
    return accu
train_data, test_data, train_labels, test_labels = train_test_split(
    all_tweets, labels, test_size=0.2, random_state=1)
print(len(train_data))
print(len(test_data))

#Making the Count Vectors
counter = CountVectorizer()
counter.fit(train_data)
train_counts = counter.transform(train_data)
test_counts = counter.transform(test_data)

print(train_data[3])
print(train_counts[3])

#Train and Test the Naive Bayes Classifier
classifier = MultinomialNB()
classifier.fit(train_counts, train_labels)
predictions = classifier.predict(test_counts)

#Evaluating Your Model via accuracy score (and confusion matrix)
from sklearn.metrics import accuracy_score
print(accuracy_score(test_labels, predictions))
#Accuracy lies around 67.8%, which is ok, but not great

from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_labels, predictions))

#Testing the prediction function with own tweet
tweet = 'Pierre, the baguette is tasty'
tweet_counts = counter.transform([tweet])
print(classifier.predict(tweet_counts))
# In[]

#trainn = train.drop(['Name','Age', 'Ticket', 'Fare', 'Cabin'], axis = 1)

# In[]
X = traincopy[['Pclass', 'Sex', 'SibSp', 'Parch', 'Age']]
Y = traincopy['Survived']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
# In[]
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    pred = knn.predict(X_test)
    print("Accuracy is", accuracy_score(Y_test, pred) * 100, "for k =", k)
    # (the original also incremented k manually, which has no effect inside a for loop)
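
Rather than scoring every k on the single held-out split, a hedged sketch that picks k by cross-validation on the training set (same variable names as above):

from sklearn.model_selection import cross_val_score

cv_scores = {}
for k in range(1, 21):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    cv_scores[k] = cross_val_score(knn_k, X_train, Y_train, cv=5).mean()
best_k = max(cv_scores, key=cv_scores.get)
print("Best k by 5-fold CV:", best_k, "with accuracy", cv_scores[best_k])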

# In[]

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=8)

# In[]

knn.fit(X_train, Y_train)
knn.score(X_test, Y_test)

# In[]

pred = knn.predict(testcopy)
Beispiel #60
0

# Imports and context assumed for this snippet: the wine data loaded into
# `dataset` (a DataFrame with a 'Class' column) and its features in `df`.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

target = dataset['Class'].iloc[0:177]  # keep the target aligned with the sliced feature rows below

df = df.iloc[0:177, [1, 12]]

sc = StandardScaler()
df = sc.fit_transform(df)

pca = PCA(n_components=2)
pca_x = pca.fit_transform(df)
pca_df = pd.DataFrame(data=pca_x, columns=['comp1', 'comp2'])

KModel = KMeans(n_clusters=3, random_state=2)
KModel.fit_predict(pca_df)
KModel.labels_

colormap = np.array(['Red', 'Blue', 'Green'])

z = plt.scatter(pca_df.comp1, pca_df.comp2, c=colormap[KModel.labels_])
KModel.labels_
accuracy_score(target, KModel.labels_)
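
Because k-means cluster ids are arbitrary, accuracy_score(target, KModel.labels_) is only meaningful after mapping each cluster to its majority true class. A hedged sketch, assuming the non-negative integer class labels of the wine data:

mapped = np.zeros_like(KModel.labels_)
for cluster in range(3):
    mask = KModel.labels_ == cluster
    # assign every member of this cluster its majority true class
    mapped[mask] = np.bincount(target[mask]).argmax()
print('accuracy after majority mapping:', accuracy_score(target, mapped))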