Example #1
    def fit_and_predict(self):
        print "****** train start ******"
        print "sample:", list(self.y_train).count(0), list(self.y_train).count(1)
        print "train percent is %s" % (float(list(self.y_train).count(1)) / len(list(self.y_train)))
        start_at = time.time()

        x_train_features = self.scaler.fit_transform(self.x_train)
        # x_train_features = self.x_train
        self.classifier.fit(x_train_features, self.y_train)

        print "train done. score %s. spend %s s" \
              % (self.classifier.score(
                  x_train_features, self.y_train), time.time() - start_at)

        x_test_features = self.scaler.transform(self.x_test)  # reuse the scaler fitted on the training data
        # x_test_features = self.x_test
        y_test_pred = self.classifier.predict(x_test_features)
        print "****** test result *******"
        print y_test_pred, len(y_test_pred)
        print list(y_test_pred).count(0), list(y_test_pred).count(1),\
            set(y_test_pred), len(set(y_test_pred))
        print "test percent is %s" % (float(list(y_test_pred).count(1)) / len(list(y_test_pred)))
        print accuracy_score(self.y_test, y_test_pred)

        x_pred_features = self.scaler.transform(self.p_x)  # reuse the scaler fitted on the training data
        # x_pred_features = self.p_x
        y_pred = self.classifier.predict(x_pred_features)
        print "****** predict result *******"
        print y_pred, len(y_pred)
        print list(y_pred).count(0), list(y_pred).count(1), set(y_pred), len(set(y_pred))
        print "predict percent is %s" % (float(list(y_pred).count(1)) / len(list(y_pred)))

        return y_pred
    def readout_sk(self, X_train, X_test, y_train, y_test, **kwargs):
        from sklearn.linear_model import LogisticRegression
        lr = LogisticRegression(**kwargs)
        lr.fit(X_train.T, y_train.T)
        y_train_predictions = lr.predict(X_train.T)
        y_test_predictions = lr.predict(X_test.T)
        return (accuracy_score(y_train.T, y_train_predictions),
                accuracy_score(y_test.T, y_test_predictions))
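# Hypothetical usage sketch (not from the original source). The .T transposes
# above suggest arrays laid out as (n_features, n_samples); `Readout` is a
# stand-in for whatever class defines readout_sk.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

class Readout:
    def readout_sk(self, X_train, X_test, y_train, y_test, **kwargs):
        lr = LogisticRegression(**kwargs)
        lr.fit(X_train.T, y_train.T)
        return (accuracy_score(y_train.T, lr.predict(X_train.T)),
                accuracy_score(y_test.T, lr.predict(X_test.T)))

rng = np.random.RandomState(0)
X_train = rng.randn(20, 100)                     # 20 features x 100 samples
X_test = rng.randn(20, 40)
y_train = (X_train.sum(axis=0) > 0).astype(int)  # length 100; .T is a no-op on 1-D arrays
y_test = (X_test.sum(axis=0) > 0).astype(int)
print(Readout().readout_sk(X_train, X_test, y_train, y_test, max_iter=200))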
def supportvector(C, gamma = 'default'):
    
    from time import time
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    
    if gamma == 'default':
        clf = SVC(kernel="rbf", C = C)
    else:
        clf = SVC(kernel="rbf", C = C, gamma = gamma)
    
    t_fit = time()
    clf.fit(features_train, labels_train)
    print("training time:", round(time() - t_fit, 3), "s")

    t_pred = time()
    pred = clf.predict(features_test)
    print("predict time:", round(time() - t_pred, 3), "s")

    print(accuracy_score(labels_test, pred))
    
    try:
        prettyPicture(clf, features_test, labels_test)
    except NameError:
        pass
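# Hypothetical usage sketch (these globals are assumed, not shown in the
# original): supportvector() reads features_train / labels_train /
# features_test / labels_test from the enclosing scope, so define them first,
# e.g. from a synthetic dataset.
import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(42)
X = rng.randn(400, 2)
y = (X[:, 0] * X[:, 1] > 0).astype(int)
features_train, features_test, labels_train, labels_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

supportvector(C=10000.0)             # default gamma
supportvector(C=10000.0, gamma=1.0)  # explicit gamma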
def main():
	f = open("me.stdout", "r").read()

	print f
	
	(confusionMatrix, labels, ytrue, ypred, trueCount) = readConfusionMatrix.readText(f)
	for row in confusionMatrix:
		print row

	precisionMicro = float(metrics.precision_score(ytrue, ypred, average="micro"))
	recallMicro = float(metrics.recall_score(ytrue, ypred, average="micro"))
	f1Micro = float(metrics.f1_score(ytrue, ypred, average="micro"))
	f1Macro = float(metrics.f1_score(ytrue, ypred, average="macro"))
	precisionMacro = float(metrics.precision_score(ytrue, ypred, average="macro"))
	recallMacro = float(metrics.recall_score(ytrue, ypred, average="macro"))

	mConf = metrics.confusion_matrix(ytrue, ypred)
	print(mConf)

	print(labels)
	print(len(ytrue))
	print(len(ypred))
	print(trueCount)

	print(metrics.accuracy_score(ytrue, ypred))

	print(precisionMicro)
	print(recallMicro)
	print(f1Micro)
	print(f1Macro)
	print(precisionMacro)
	print(recallMacro)
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x:x,max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1,verbose=1,class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw,ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    print(rf.oob_score_)

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print(classification_report(y_true=ytrain_raw, y_pred=ytrain_predict))
    print(confusion_matrix(y_true=ytrain_raw, y_pred=ytrain_predict))

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print(accuracy_score(y_true=ytest_raw, y_pred=ytest_predict))
    print(classification_report(y_true=ytest_raw, y_pred=ytest_predict))
Example #6
    def score(self, K, y, sample_weight=None):
        """Returns the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always
        predicts the expected value of y, disregarding the input features,
        would get a R^2 score of 0.0.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) wrt. y.
        """
        y_pred = self.predict(K)
        if sample_weight is None:
            return np.mean([accuracy_score(
                y[j], y_pred[j]) for j in range(len(K))])
        else:
            return np.mean([
                accuracy_score(y[j], y_pred[j], sample_weight=sample_weight[j])
                for j in range(len(K))])
def ranforest(n_estimators, min_samples_split):
    
    from time import time
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    
    clf = RandomForestClassifier(n_estimators = n_estimators, 
                                 min_samples_split = min_samples_split,
                                 bootstrap = True)
    t_fit = time()
    clf.fit(features_train, labels_train)
    print("training time:", round(time() - t_fit, 3), "s")

    t_pred = time()
    pred = clf.predict(features_test)
    print("predict time:", round(time() - t_pred, 3), "s")

    print(accuracy_score(labels_test, pred))

    
    try:
        prettyPicture(clf, features_test, labels_test)
    except NameError:
        pass
def main():
    """
    Main function
    :return:
    """
    # Load the csv file into pandas dataframe
    dataset = pd.read_csv(OUTPUT_PATH)
    # Get basic statistics of the loaded dataset
    dataset_statistics(dataset)

    # Filter missing values
    dataset = handel_missing_values(dataset, HEADERS[6], '?')
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[1:-1], HEADERS[-1])

    # Train and Test dataset size details
    print("Train_x Shape ::", train_x.shape)
    print("Train_y Shape ::", train_y.shape)
    print("Test_x Shape ::", test_x.shape)
    print("Test_y Shape ::", test_y.shape)

    # Create random forest classifier instance
    trained_model = random_forest_classifier(train_x, train_y)
    print "Trained model :: ", trained_model
    predictions = trained_model.predict(test_x)

    for i in range(0, 5):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(list(test_y)[i], predictions[i]))

    print("Train Accuracy ::", accuracy_score(train_y, trained_model.predict(train_x)))
    print("Test Accuracy  ::", accuracy_score(test_y, predictions))
    print("Confusion matrix ::")
    print(confusion_matrix(test_y, predictions))
Example #9
 def on_epoch_end(self, epoch, logs={}):
     p = model.predict(X, verbose=1)
     #p = model.predict({'input': X}, verbose=1)['output']
     
     train_acc = accuracy_score(Y[:t1], np.round(p[:t1]))
     dev_acc = accuracy_score(Y[t1:t2], np.round(p[t1:t2]))
     test_acc = accuracy_score(Y[t2:], np.round(p[t2:]))
     print('Accuracy | train:', train_acc, 'dev:', dev_acc, 'test:', test_acc)
     
     train_recall = (recall_score(Y[:t1], np.round(p[:t1])) + recall_score(Y[:t1], np.round(p[:t1]), pos_label=0)) / 2
     dev_recall = (recall_score(Y[t1:t2], np.round(p[t1:t2])) + recall_score(Y[t1:t2], np.round(p[t1:t2]), pos_label=0)) / 2
     test_recall = (recall_score(Y[t2:], np.round(p[t2:])) + recall_score(Y[t2:], np.round(p[t2:]), pos_label=0)) / 2
     print('Recall   | train:', train_recall, 'dev:', dev_recall, 'test:', test_recall)
     
     self.accuracy[epoch, :] = np.array([train_acc, dev_acc, test_acc])
     self.recall[epoch, :] = np.array([train_recall, dev_recall, test_recall])
     
     plt.clf()
     plt.subplot(211)
     lines = plt.plot(range(1, epoch+2), self.accuracy[:epoch+1])
     plt.legend(iter(lines), ('train', 'dev', 'test'), loc=4)
     plt.xlabel('Epoch')
     plt.ylabel('Accuracy')
     plt.axis([1, epoch+2, 0, 1])
     plt.subplot(212)
     lines = plt.plot(range(1, epoch+2), self.recall[:epoch+1])
     plt.legend(iter(lines), ('train', 'dev', 'test'), loc=4)
     plt.xlabel('Epoch')
     plt.ylabel('Average recall')
     plt.axis([1, epoch+2, 0, 1])
     plt.savefig('results.png')
Example #10
def feature_scaled_nn_acc(mds, type):
    train, validation = validation_split(mds)
    # Multiply by 1 to convert bool to int
    y_train = train['Up'] * 1
    X_train = train.drop('Up', axis=1)
    y_validation = validation['Up'] * 1
    X_validation = validation.drop('Up', axis=1)
    pre = PCA(n_components=19, whiten=True)
    X_train_pca = pre.fit_transform(X_train)
    X_validation_pca = pre.transform(X_validation)  # reuse the PCA fitted on the training set
    model = create_model(X_train_pca.shape[1], type)
    # Convert to Keras format
    y_train = to_categorical(y_train.values)
    y_validation = to_categorical(y_validation.values)
    model.fit(X_train_pca, y_train, nb_epoch=5, batch_size=16)
    time.sleep(0.1)
    # Fit and guess
    guess_train = model.predict_classes(X_train_pca)
    guess_train = to_categorical(guess_train)

    guess_validation = model.predict_classes(X_validation_pca)
    guess_validation = to_categorical(guess_validation)

    train_acc = accuracy_score(y_train, guess_train)
    validation_acc = accuracy_score(y_validation, guess_validation)
    print "\n neural net train accuracy is {}".format(train_acc)
    print "\n neural net validation accuracy is {}".format(validation_acc)
    return guess_validation
Example #11
def nn_acc(mds, type, epoch=5, batch=16):
    train, validation = validation_split(mds)
    # Multiply by 1 to convert bool to int
    y_train = train['Up'] * 1
    X_train = train.drop('Up', axis=1)
    y_validation = validation['Up'] * 1
    X_validation = validation.drop('Up', axis=1)
    # Create Model
    model = create_model(X_train.shape[1], type)
    # Convert to Keras format
    X_train = X_train.to_numpy()
    X_validation = X_validation.to_numpy()
    y_train = to_categorical(y_train.values)
    y_validation = to_categorical(y_validation.values)
    # Fit and guess
    model.fit(X_train, y_train, nb_epoch=epoch, batch_size=batch)
    guess_train = model.predict_classes(X_train)
    guess_train = to_categorical(guess_train)

    guess_validation = model.predict_classes(X_validation)
    guess_validation = to_categorical(guess_validation)

    train_acc = accuracy_score(y_train, guess_train)
    validation_acc = accuracy_score(y_validation, guess_validation)
    print "\n neural net train accuracy is {}".format(train_acc)
    print "\n neural net validation accuracy is {}".format(validation_acc)
    return guess_validation
Example #12
def Adaboost(TrainData,TestData):
    features=['Time','Season','Hour','Minute','District']

    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(),n_estimators=30)

    size=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
    for i in range(0,len(size)):
        train,validation= train_test_split(TrainData, train_size=size[i])

        while len(set(train['Category'])) != len(set(validation['Category'])):
            train,validation= train_test_split(TrainData, train_size=size[i])
        clf = clf.fit(train[features], train['Category'])
        """stop = timeit.default_timer()
        print "Runnin  time adaboost is ", stop-start"""
        predicted=np.array(clf.predict_proba(validation[features]))
        model=clf.predict(train[features])
        model1=clf.predict(validation[features])

        #scores = cross_val_score(clf, validation[features], validation['Category'])
        #print "Scores mean is",scores.mean()
        #accuracy
        print "Training accuracy is", accuracy_score(train['Category'].values.tolist(),model)
        print "Validation accuracy is",accuracy_score(validation['Category'].values.tolist(),model1)
        print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Log loss is", log_loss(validation['Category'].values.tolist(),predicted,eps=1e-15, normalize=True, sample_weight=None)


        #writing to file
        """Category_new=[]
def sub_analysis(subs, sub_data_generator):
        for sub in subs:
            print("process sub: %0.0f" % sub)
            # X_tr, y_tr, X_te, y_te, pipe = next(sub_data_generator)
            X, y, _, _, pipe = next(sub_data_generator)
            cv_scores  = []
            te1_scores = []
            te2_scores = []
            for state in range(3):
                X_tr, X_te, y_tr, y_te = prep.stratified_split(X, y, state)
                # train on nmsk:
                algo, algo_params, score = prep.params.model
                clf = GridSearchCV(algo, algo_params, cv=3, scoring=score, n_jobs=-1, verbose=0)
                clf.fit(prep.convert(pipe, X_tr), y_tr)
                # test on msk:
                if prep.params.collapse_opacities:
                    X_te_high, y_te_high, X_te_low, y_te_low = prep.split_high_low_sets(X_te, y_te)
                    y_te_high = prep.collapse_lbls(y_te_high, (3,4,5), (0,1,2))
                    y_true_high, y_pred_high = y_te_high, clf.predict(prep.convert(pipe, X_te_high))
                    y_true_low,  y_pred_low  = y_te_low,  clf.predict(prep.convert(pipe, X_te_low))
                    cv_scores.append(clf.best_score_)
                    te1_scores.append(accuracy_score(y_true_high, y_pred_high))
                    te2_scores.append(accuracy_score(y_true_low, y_pred_low))
                else:
                    y_true, y_pred = y_te, clf.predict(prep.convert(pipe, X_te))
                    cv_scores.append(clf.best_score_)
                    te1_scores.append(accuracy_score(y_true, y_pred))
                    te2_scores = None
            yield cv_scores, te1_scores, te2_scores
def assess_classification_performance(model, X_train, y_train, X_test, y_test, short = False):
  
    accuracy_train = metrics.accuracy_score(y_train, model.predict(X_train))
    accuracy_test = metrics.accuracy_score(y_test, model.predict(X_test))
    print('accuracy (train/test): {} / {}\n'.format(accuracy_train, accuracy_test))
    
    if not short:
    
      # confusion matrix
      # rows: actual group
      # columns: predicted group
      print('Confusion_matrix (training data):')
      print(metrics.confusion_matrix(y_train, model.predict(X_train)))
      
      print('Confusion_matrix (test data):')
      print(metrics.confusion_matrix(y_test, model.predict(X_test)))

      # precision =  tp / (tp + fp)
      # recall = tp / (tp + fn) (= sensitivity)
      # F1 = 2 * (precision * recall) / (precision + recall)
      print('\nPrecision - recall (training data):')
      print(metrics.classification_report(y_train, model.predict(X_train)))
      
      print('\nPrecision - recall (test data):')
      print(metrics.classification_report(y_test, model.predict(X_test)))
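# Usage sketch (the data and model below are illustrative assumptions): any
# fitted estimator with a .predict method works, e.g. logistic regression on iris.
from sklearn import datasets, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = datasets.load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
assess_classification_performance(clf, X_tr, y_tr, X_te, y_te)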
Example #15
    def testIrisStreaming(self):
        iris = datasets.load_iris()

        def iris_data():
            while True:
                for x in iris.data:
                    yield x

        def iris_predict_data():
            for x in iris.data:
                yield x

        def iris_target():
            while True:
                for y in iris.target:
                    yield y

        classifier = skflow.TensorFlowLinearClassifier(n_classes=3, steps=100)
        classifier.fit(iris_data(), iris_target())
        score1 = accuracy_score(iris.target, classifier.predict(iris.data))
        score2 = accuracy_score(iris.target, classifier.predict(iris_predict_data()))
        self.assertGreater(score1, 0.5, "Failed with score = {0}".format(score1))
        self.assertEqual(score2, score1, "Scores from {0} iterator doesn't "
                                         "match score {1} from full "
                                         "data.".format(score2, score1))
Example #16
def main(unused_args):
  ### Download and load MNIST dataset.
  mnist = learn.datasets.load_dataset('mnist')

  ### Linear classifier.
  feature_columns = learn.infer_real_valued_columns_from_input(
      mnist.train.images)
  classifier = learn.LinearClassifier(
      feature_columns=feature_columns, n_classes=10)
  classifier.fit(mnist.train.images,
                 mnist.train.labels.astype(np.int32),
                 batch_size=100,
                 steps=1000)
  score = metrics.accuracy_score(mnist.test.labels,
                                 list(classifier.predict(mnist.test.images)))
  print('Accuracy: {0:f}'.format(score))

  ### Convolutional network
  classifier = learn.Estimator(model_fn=conv_model)
  classifier.fit(mnist.train.images,
                 mnist.train.labels,
                 batch_size=100,
                 steps=20000)
  score = metrics.accuracy_score(mnist.test.labels,
                                 list(classifier.predict(mnist.test.images)))
  print('Accuracy: {0:f}'.format(score))
Example #17
def main():
	input_train_file_ptr = sys.argv[1]
	input_test_file_ptr = sys.argv[2]
	# read the csv file and return a pandas dataframe with tweets and sentiment as its two columns
	train_tweests_with_sentiments = pre_process_input_data(input_train_file_ptr)
	test_tweets_data = pre_process_input_data(input_test_file_ptr)
	bigram_vectorizer = CountVectorizer(ngram_range=(2,2),token_pattern=r'\b\w+\b', min_df=1,lowercase=True)
	# print tweests_array
	tweets_array, sentiments_array = get_tweest_and_sentiments(train_tweests_with_sentiments)
	print("size of tweets array is %s and sentiment array is %s  " % (tweets_array.size, sentiments_array.size))
	test_tweets,test_sentiments = get_tweest_and_sentiments(test_tweets_data)
	test_sentiments =  test_sentiments.flatten()
	print("size of test tweets array is %s and test sentiment array is %s  " % (test_tweets.size, test_sentiments.size))
	parsed_train_tweets = clean_data_to_feed_classifier(tweets_array)
	parsed_test_tweets = clean_data_to_feed_classifier(test_tweets)
	# print parsed_tweests
	x = bigram_vectorizer.fit_transform(parsed_train_tweets)
	print(x.size)
	# print bigram_vectorizer.get_feature_names()
	bigram_vectorizer.build_analyzer()
	print "done 1"
	# print bigram_vectorizer.get_feature_names()
	res = bigram_vectorizer.transform(parsed_test_tweets)
	print "done 2"
	clf = LinearSVC()
	gnb = MultinomialNB()
	print "done 2"
	trained_classifier = do_K_fold_cross_validation(clf,gnb,x,sentiments_array.flatten())
	# trained_classifier.fit(x, sentiments_array.flatten())
	print "done 3"
	output =  trained_classifier.predict(res)
	# print output
	print accuracy_score(test_sentiments,output)
Example #18
    def do(self, n_pts):
        X, y = self.collect_pts(n_pts)

        print('done collecting points')

        rbf_map = RBFSampler(n_components=n_pts, random_state=1)
        solver = HyperSolver(p=self.POS, n=self.NEG)
        rbf_solver = pipeline.Pipeline([("mapper", rbf_map),
                                        ("solver", solver)])

        gamma_range = np.logspace(-15, 6, 22, base=2)
        param_grid = dict(mapper__gamma=gamma_range)
        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=1)
        grid = GridSearchCV(rbf_solver, param_grid=param_grid, cv=cv, n_jobs=8)
        grid.fit(X, y)

        scores = [x[1] for x in grid.grid_scores_]
        scores = np.array(scores).reshape(len(gamma_range))
        plt.figure(figsize=(8, 6))
        plt.plot(gamma_range, scores)

        plt.xlabel('gamma')
        plt.ylabel('score')
        plt.title('Validation accuracy (RTiX, %s)' % os.path.basename(self.name))
        plt.savefig(self.name + '-SLViF-grid-npts=%d.pdf' % n_pts)

        # final train
        g = grid.best_params_['mapper__gamma']
        print('best parameters are g=%f' % g)
        rbf_svc2 = grid.best_estimator_
        y_pred = rbf_svc2.predict(self.Xt)
        acc = sm.accuracy_score(self.Yt, y_pred)
        print('SCORE: %f' % acc)
        return grid.best_score_, acc
Example #19
def detect_anomalies():

    encoded_X_train = np.load("resources/files/encoded_X_train.npy")
    encoded_X_test = np.load("resources/files/encoded_X_test.npy")
    print(encoded_X_train.shape)
    print(encoded_X_test.shape)

    clf = svm.OneClassSVM(nu=0.1, kernel="linear")
    clf.fit(encoded_X_train)
    y_pred_train = clf.predict(encoded_X_train)
    y_pred_test = clf.predict(encoded_X_test)
    y_pred_outliers = clf.predict(np.full((100,hidden_dimensions[1]),4))

    # print y_pred_train[y_pred_train == -1].size
    # print y_pred_test[y_pred_test == -1].size
    # print y_pred_outliers[y_pred_outliers == -1].size

    # n_normal_points_test = X_test[y_pred_test == 1]
    # n_anomalies_test = X_test[y_pred_test == -1]
    # print(n_normal_points_test.shape)
    # print(n_anomalies_test.shape)

    print("Train Accuracy: %f"%(accuracy_score(Y_train, y_pred_train)))
    print("Test Accuracy: %f"%( accuracy_score(Y_test, y_pred_test)))
    print("Precision: %f" % (precision_score(Y_test, y_pred_test,pos_label=1)))
    #print("Recall: %f" % (precision_score(Y_test, y_pred_test, pos_label=-1)))
    print "Confusion Matrix: (Anomalies, Normal)"
    print confusion_matrix(Y_test,y_pred_test,labels=[-1,1])
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_test, pos_label=1)
    print "AUC: %f"%metrics.auc(fpr, tpr)
Example #20
    def grid_search(self):
        C_range = np.logspace(-5, 15, 21, base=2)
        param_grid = dict(C=C_range)
        cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(LinearSVC(dual=False, max_iter=10000), param_grid=param_grid,
                            cv=cv,
                            n_jobs=1, verbose=0)

        logger.info('start grid search for Linear')
        grid.fit(self.X_ex, self.y_ex)
        logger.info('end grid search for Linear')

        scores = [x[1] for x in grid.grid_scores_]

        # final train
        rbf_svc2 = grid.best_estimator_

        pred_train = rbf_svc2.predict(self.X_ex)
        pred_val = rbf_svc2.predict(self.val_x)
        pred_test = rbf_svc2.predict(self.test_x)

        r = Result(self.name + ' (X)', 'Linear', len(self.X_ex),
                   sm.accuracy_score(self.y_ex, pred_train),
                   sm.accuracy_score(self.val_y, pred_val),
                   sm.accuracy_score(self.test_y, pred_test))
        return r
def getAccuracy(labels,predictedLabels, positive_label):

    totalExamples = labels.shape[0]
    accuracy = (getTP(labels,predictedLabels,positive_label) + getTN(labels,predictedLabels,positive_label)) / float(totalExamples)
    print ("Built-in accuracy = {}".format(metrics.accuracy_score(labels, predictedLabels)))
    print ("Accuracy = {}".format(accuracy))
    return metrics.accuracy_score(labels, predictedLabels)
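# getTP and getTN are referenced above but not defined in this snippet; a
# minimal sketch of what they plausibly compute on numpy label arrays
# (true-positive and true-negative counts for the given positive label):
def getTP(labels, predictedLabels, positive_label):
    return ((labels == positive_label) & (predictedLabels == positive_label)).sum()

def getTN(labels, predictedLabels, positive_label):
    return ((labels != positive_label) & (predictedLabels != positive_label)).sum()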
Example #22
    def train(self):
        X_train = self.dataset.df_train['ids_string'].values
        y_train = self.dataset.df_train['label'].values
        X_test = self.dataset.df_test['ids_string'].values
        y_test = self.dataset.df_test['label'].values

        print(datetime.datetime.now(), 'Vectorizing')
        if opt['bow_tfidf'] == False:
            self.cv = CountVectorizer(ngram_range=opt['bow_ngram_range'], min_df=opt['bow_min_df'])
            X_train = self.cv.fit_transform(X_train)
            X_test = self.cv.transform(X_test)
        else:
            self.tfidf = TfidfVectorizer(ngram_range=opt['bow_ngram_range'], min_df=opt['bow_min_df'])
            X_train = self.tfidf.fit_transform(X_train)
            X_test = self.tfidf.transform(X_test)

        #TODO: use sparse.vstack
        X_train = np.concatenate((X_train.todense(), self.dataset.df_train[self.dataset.features].values), axis=1)
        X_test = np.concatenate((X_test.todense(), self.dataset.df_test[self.dataset.features].values), axis=1)

        print(datetime.datetime.now(), 'Training')
        self.lr = LogisticRegression()
        self.lr.fit(X_train, y_train)

        y_predict = self.lr.predict(X_train)
        accuracy_train = accuracy_score(y_train, y_predict)
        y_predict = self.lr.predict(X_test)
        accuracy_test = accuracy_score(y_test, y_predict)
        print(datetime.datetime.now(), (accuracy_train, accuracy_test))
def testsvm2():
    genre_list = ["classical", "jazz", "country", "pop", "rock", "metal"]
    Xtrain,ytrain,Xtest,ytest = getSplitData()
    Xtrain, Xtest = getScaledData(Xtrain, Xtest)
    traindata = Xtrain
    trainlabel = ytrain
    testdata = Xtest
    testlabel = ytest
    X = np.vstack((Xtrain,Xtest))
    y = np.hstack((ytrain,ytest))
    accuracy1 = []
    
    kf = KFold(600, n_folds=10)
    for train, test in kf:
        #print("%s %s" % (train, test))
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        traindata = X_train
        trainlabel = y_train
        testdata = X_test
        testlabel = y_test
        clf = SVC(C = 1, gamma = 0.125, kernel = 'rbf')
        clf.fit(traindata, trainlabel)
        pred = clf.predict(testdata)
        print "svm classification accuracy: ", accuracy_score(testlabel,pred)
        cm = confusion_matrix(testlabel, pred)
        print cm
        plot_confusion_matrix(cm,genre_list)
        accuracy1.append(accuracy_score(testlabel,pred))
    print np.mean(accuracy1)
Example #24
def main(unused_argv):


    x,y=load_data()

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

    vp = learn.preprocessing.VocabularyProcessor(max_document_length=MAX_DOCUMENT_LENGTH, min_frequency=1)

    x_train = np.array(list(vp.fit_transform(x_train)))
    x_test = np.array(list(vp.transform(x_test)))
    n_words=len(vp.vocabulary_)
    print('Total words: %d' % n_words)

    gnb = GaussianNB()
    y_predict = gnb.fit(x_train, y_train).predict(x_test)
    score = metrics.accuracy_score(y_test, y_predict)
    print('NB Accuracy: {0:f}'.format(score))

    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(x_train)
    classifier = tf.contrib.learn.DNNClassifier(
        feature_columns=feature_columns, hidden_units=[500,10], n_classes=2)

    classifier.fit(x_train, y_train, steps=5000, batch_size=10)
    y_predict=list(classifier.predict(x_test, as_iterable=True))
    score = metrics.accuracy_score(y_test, y_predict)
    print('DNN Accuracy: {0:f}'.format(score))
Example #25
def getScores(y, yPredTrain, yTest, yPredTest):

    scores = dict()

    scores['f1Train'] = f1_score(y, yPredTrain)
    scores['f1Test'] = f1_score(yTest, yPredTest)


    scores['accTrain'] = accuracy_score(y, yPredTrain)
    scores['accTest'] = accuracy_score(yTest, yPredTest)
    

    scores['rocTrain'] = roc_auc_score(y, yPredTrain)
    scores['rocTest'] = roc_auc_score(yTest, yPredTest)
    

    scores['cMatrixTrain'] = confusion_matrix(y, yPredTrain)
    scores['cMatrixTest'] = confusion_matrix(yTest, yPredTest)

    proba = float(len(np.where(y==1)[0]))/len(y)
    if proba < 0.50:
        proba = 1 - proba
    scores['random'] = proba
    
    return scores
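# Usage sketch with made-up binary labels: note that scores['random'] is the
# majority-class rate, i.e. the accuracy a constant classifier would reach.
import numpy as np

y = np.array([0, 0, 1, 1, 1, 0, 1, 0])
yPredTrain = np.array([0, 1, 1, 1, 0, 0, 1, 0])
yTest = np.array([1, 0, 1, 0])
yPredTest = np.array([1, 0, 0, 0])
print(getScores(y, yPredTrain, yTest, yPredTest))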
def main():
    #cleanAndWrite()

    df_gtd_imp = pd.read_pickle('df_gtd_imp.pkl')

    df_test = df_gtd_imp.query("gname == 'Unknown'")
    df_test = df_test.drop('gname',1)

    df_train = df_gtd_imp.query("gname != 'Unknown'")
    df_train_x = pd.DataFrame(df_train)
    df_train_x = df_train_x.drop('gname',1)
    df_train_y = df_train.gname

    print(df_train_x.shape)
    print(df_train_y.shape)
    print(df_test.shape)

    df_train_x, df_test = selectRelFeatures(df_train_x, df_train_y, df_test)

    print('training model')
    X_train, X_test, y_train, y_test = \
            train_test_split(df_train_x, df_train_y, test_size=0.3, random_state=0)

    clf_rf = RandomForestClassifier(n_estimators=5)
    clf_rf.fit(X_train, y_train)
    y_pred = clf_rf.predict(X_test)
    print(accuracy_score(y_test, y_pred))
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring):
    '''
    Run normal SVM classification without cross-fold validation.
    '''

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3) # 30% reserved for validation

    # feature selection since we have a small sample space
    fs = SelectPercentile(scoring, percentile=20)

    pipeline = Pipeline([('featureselector', fs), ('scaler', StandardScaler()), ('estimator', estimator)])

    pipeline = OneVsRestClassifier(pipeline)

    clfer = pipeline.fit(x_train, y_train)
    y_predict_train = clfer.predict(x_train)

    print "%% Accuracy on training set: %2.3f" % metrics.accuracy_score(y_train, y_predict_train)

    y_predict_test = clfer.predict(x_test)
    print "\n%% Accuracy on testing set: %2.3f" % metrics.accuracy_score(y_test, y_predict_test)

    print "\nClassification Report:"
    print metrics.classification_report(y_test, y_predict_test)

    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_predict_test)
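# Usage sketch (the argument choices are assumptions): `scoring` is the
# univariate scoring function handed to SelectPercentile (e.g. f_classif) and
# `estimator` is any sklearn classifier.
from sklearn import datasets
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC

x, y = datasets.load_iris(return_X_y=True)
simple_classification_without_cross_fold_validation(x, y, SVC(kernel='linear'), f_classif)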
	def testNetwork(self,X, Y):
		"""
		Test the neural network
		"""
		YPred = self.nn.predict(X)
		self.testError = 1 - accuracy_score(Y, YPred)
		print(accuracy_score(Y, YPred))
def fitMdl(nFitObs=50):
    mdl = linear_model.LogisticRegression(verbose=1)
    mdl.fit(np.reshape(glbObsTrnFtr[0:nFitObs, :, :],
                       (nFitObs, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])),
            glbObsTrnRsp[0:nFitObs])
    print(mdl.get_params())
    print(mdl.coef_.shape)
    print('  coeff stats:')
    for lblIx in range(len(dspLabels)):
        print('  label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; '
              'maxCoeff:row:%2d, col:%2d, value:%0.4f;' %
              (dspLabels[lblIx],
               mdl.coef_[lblIx, :].argmin() // glbImgSz,
               mdl.coef_[lblIx, :].argmin() % glbImgSz,
               mdl.coef_[lblIx, :].min(),
               mdl.coef_[lblIx, :].argmax() // glbImgSz,
               mdl.coef_[lblIx, :].argmax() % glbImgSz,
               mdl.coef_[lblIx, :].max()))

    train_pred_labels = mdl.predict(np.reshape(glbObsTrnFtr[0:nFitObs, :, :],
                                               (nFitObs, glbImgSz ** 2)))
    accuracy_train = metrics.accuracy_score(glbObsTrnRsp[0:nFitObs], train_pred_labels)
    print('  accuracy train:%0.4f' % accuracy_train)
    print(metrics.confusion_matrix(glbObsTrnRsp[0:nFitObs], train_pred_labels))

    valid_pred_labels = mdl.predict(np.reshape(glbObsVldFtr,
                                               (glbObsVldFtr.shape[0], glbImgSz ** 2)))
    accuracy_valid = metrics.accuracy_score(glbObsVldRsp, valid_pred_labels)
    print('  accuracy valid:%0.4f' % accuracy_valid)
    print(metrics.confusion_matrix(glbObsVldRsp, valid_pred_labels))

    test_pred_labels = mdl.predict(np.reshape(glbObsNewFtr,
                                              (glbObsNewFtr.shape[0], glbImgSz ** 2)))
    accuracy_test = metrics.accuracy_score(glbObsNewRsp, test_pred_labels)
    print('  accuracy  test:%0.4f' % accuracy_test)
    test_conf = pd.DataFrame(metrics.confusion_matrix(glbObsNewRsp, test_pred_labels),
                             index=dspLabels, columns=dspLabels)
    print(test_conf)

    return (mdl, (accuracy_train, accuracy_valid, accuracy_test))
Example #30
def main():
    indata = np.load(inputs)
    training_data = indata['data_training']
    training_scaled = preprocessing.scale(training_data)
    training_labels = indata['label_training']
    validation_data = indata['data_val']
    validation_scaled = preprocessing.scale(validation_data)
    validation_labels = indata['label_val']
    ts = range(-12,6)
    cs = [pow(10, t) for t in ts]
    accuracy_results = []
    accuracy_results_scaled = []

    for c in cs:
        lin_clf = svm.LinearSVC(C=c)
        lin_clf.fit(training_data, training_labels)
        predictions = lin_clf.predict(validation_data)
        accuracy = metrics.accuracy_score(validation_labels, predictions)
        accuracy_results.append(accuracy)

        lin_clf.fit(training_scaled, training_labels)
        predictions = lin_clf.predict(validation_scaled)
        accuracy_scaled = metrics.accuracy_score(validation_labels, predictions)
        accuracy_results_scaled.append(accuracy_scaled)

    plt.plot(range(len(cs)), accuracy_results, label='un-scaled')
    plt.plot(range(len(cs)), accuracy_results_scaled, label='scaled')
    plt.xticks(range(len(cs)), cs, size='small')
    plt.legend()
    plt.show()
    print(accuracy_results)
    print(accuracy_results_scaled)
print("Model 2 prediction started")

data_train = pd.read_csv('model2.csv')  # read file generated by preprocess

missing_vals= data_train.isnull().sum().sort_values(ascending=False)
print(missing_vals)
print(data_train.shape[0])

print(data_train['failure'].value_counts())

Y = data_train['failure']                  # remove cols
serial=data_train['serial_number']
data_train.drop('failure', axis=1, inplace=True)
data_train.drop('serial_number', axis=1, inplace=True)

from imblearn.over_sampling import SMOTE   # Apply SMOTE
smt = SMOTE()
data_train, Y = smt.fit_resample(data_train, Y)

# Apply train/test split to partition the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_train, Y, test_size=0.5, random_state=3)

from sklearn import tree        #use decision tree classifier 
clf = tree.DecisionTreeClassifier()
clf.fit(x_train,y_train)

y_pred=clf.predict(x_test)    #predicted values
from sklearn.metrics import confusion_matrix,accuracy_score
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))    #get Accuracy
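# Note (not part of the original script): oversampling before the split lets
# synthetic neighbors of test rows leak into training. A common variant splits
# first and applies SMOTE to the training partition only. A minimal sketch,
# assuming data_train and Y are the un-resampled frames from above:
x_train, x_test, y_train, y_test = train_test_split(data_train, Y, test_size=0.5, random_state=3)
x_train, y_train = SMOTE().fit_resample(x_train, y_train)
clf = tree.DecisionTreeClassifier().fit(x_train, y_train)
print(accuracy_score(y_test, clf.predict(x_test)))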
Example #32
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=0.2, random_state=seed)

results = []
names = []

models = [('LR', LogisticRegression()), ('LDA', LinearDiscriminantAnalysis()),
          ('KNN', KNeighborsClassifier()), ('CART', DecisionTreeClassifier()),
          ('SVM', SVC())]

for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model,
                                 X_train,
                                 Y_train,
                                 cv=kfold,
                                 scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    final_results = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(final_results)

knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)

print(" ")
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
Example #33
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
le = preprocessing.LabelEncoder()
data = pd.read_csv("mining.csv")
print(data.head())
for c in data.columns.values:
    if data[c].dtypes == 'object':
        le.fit(data[c].values)
        data[c] = le.transform(data[c])
data = data.to_numpy()
X = data[:, 1:10]
y = data[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
yp = knn.predict(X_test)
print(accuracy_score(y_test, yp))
print(confusion_matrix(y_test, yp))
Example #34
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import pandas as pd

#read the csv files for both training and testing
ftrain = pd.read_csv('C:/Users/user/Documents/hitachi pentaho/train.csv')
ltrain = pd.read_csv('C:/Users/user/Documents/hitachi pentaho/trainVar.csv')
ftest = pd.read_csv('C:/Users/user/Documents/hitachi pentaho/test.csv')
ltest = pd.read_csv('C:/Users/user/Documents/hitachi pentaho/testVar.csv')

#creating data frames for training purposes
#x will take all the features and y will take the labels
x = ftrain.values.tolist()
y = list(ltrain['molecules'])

#creating data frames for testing purposes
#p will take all the features and q will be the expected classification
p = ftest.values.tolist()
q = list(ltest['molecules'])

clf = GaussianNB()
clf.fit(x, y)
output = clf.predict(p)
print(output)
print(accuracy_score(q, output))


Example #35
baseline_acc = round(Y_test[Y_test == 0].shape[0] / Y_test.shape[0], 3)
print(f'The baseline accuracy of a naive model is {baseline_acc}')

# #### Random Forest Model

# I also build Random Forests in order to predict if a datapoint is a pick-up location or not. RF outperforms the baseline approach significantly.
#
# Random forest, as its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction, and the class with the most votes becomes our model's prediction.

# In[62]:

clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=0)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
acc = round(accuracy_score(Y_test, Y_pred), 3)
print(f'The accuracy is {acc}')

# ##### Feature Importance

# We can also inspect and interpret the trained Random Forest classifier by analyzing the importance of each feature. Coordinates are the most important features to classify a pick-up point, whereas the day feature does not help the classifier.

# In[63]:

importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(len(importances)), importances[indices], yerr=std[indices], align="center")
plt.xticks(range(len(importances)), indices)
plt.show()
Example #36
del csv[0]
# Shuffle the data
random.shuffle(csv)
# Split into training and test data (2:1 ratio)
total_len = len(csv)
train_len = int(total_len * 2 / 3)
train_data = []
train_label = []
test_data = []
test_label = []

for i in range(total_len) :
    data = csv[i][0:4]
    label = csv[i][4]
    if i < train_len :
        train_data.append(data)
        train_label.append(label)
    else :
        test_data.append(data)
        test_label.append(label)
# Train on the data and predict
clf = svm.SVC(gamma='auto')
clf.fit(train_data, train_label)
pre = clf.predict(test_data)
# Compute the accuracy
ac_score = metrics.accuracy_score(test_label, pre)

print("전체 데이터 수: %d" %total_len)
print("학습 전용 데이터 수: %d" %train_len)
print("테스트 데이터 수: %d" %(len(test_data)))
print("정답률 =", ac_score)
Example #37
    def _test(metric_device):
        metric_device = torch.device(metric_device)
        acc = Accuracy(is_multilabel=True, device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        acc.update((y_pred, y))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        y = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        acc.update((y_pred, y))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
        # check that result is not changed
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        # Batched Updates
        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()

        batch_size = 16
        n_iters = y.shape[0] // batch_size + 1

        for i in range(n_iters):
            idx = i * batch_size
            acc.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(y_pred.cpu())  # (N, C, L, ...) -> (N * L * ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, L, ...) -> (N * L ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
Example #38
# One-hot encode the sample features
MONTH_t = pd.to_datetime(train['EVENT_DATE']).dt.month
MONTH_t = pd.get_dummies(MONTH_t)
number_t = pd.get_dummies(test['事件数'])
SHENFEN_t = pd.get_dummies(test['ADMIN1'])
test_set = pd.concat([MONTH_t, number_t, SHENFEN_t], axis=1)

x = train_set.loc[:,train_set.columns!='crime_type']
y = train_set['crime_type']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=1)

model = BernoulliNB()  # Bernoulli naive Bayes
model.fit(x_train, y_train)
y_pred = model.predict(x_test)  # compare predictions against actual values and print the accuracy
print("BernoulliNB model accuracy: ", metrics.accuracy_score(y_test, y_pred))
model_LR = LogisticRegression(C=0.1)  # logistic regression
model_LR.fit(x_train, y_train)
y_pred = model_LR.predict(x_test)
print("Logistic regression model accuracy: ", metrics.accuracy_score(y_test, y_pred))
model_RF = RandomForestClassifier()  # random forest
model_RF.fit(x_train, y_train)
y_pred = model_RF.predict(x_test)
print("Random forest model accuracy: ", metrics.accuracy_score(y_test, y_pred))






# image data machine learning
from sklearn import model_selection, svm, metrics
# Read and process the CSV file (convert each pixel of the image data to a float vector)
def load_csv(fname):
    labels = []
    images = []
    with open(fname, "r") as f:
        for line in f:
            cols = line.split(",")
            if len(cols) < 2: continue
            labels.append(int(cols.pop(0)))
            vals = list(map(lambda n: int(n) / 256, cols))
            images.append(vals)
    return {"labels":labels, "images":images}

data = load_csv("./mnist/train.csv")
test = load_csv("./mnist/t10k.csv")

clf = svm.SVC()
clf.fit(data["images"], data["labels"])
predict = clf.predict(test["images"])
# Check the results
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("정답률 =", ac_score)
print("리포트 =")
print(cl_report)
    clf = GridSearchCV(RandomForestClassifier(),
                       tuned_parameters,
                       cv=5,
                       scoring='%s' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print("Detailed confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("Accuracy Score: \n")
    print(accuracy_score(y_true, y_pred))

    print()
Example #41
X_train, X_test, y_train, y_test = train_test_split( X_data, Y_data, \
                                                    test_size = 0.3, \
                                                    train_size = 0.7, \
                                                    random_state = 100)

# Estimating the tree model - such as the ensemble comparison
RFC_Model = RandomForestClassifier(max_depth=20, random_state=100,\
                                   n_estimators=15)
RFC_Model.fit(X_train, y_train.ravel())
# print(RFC_Model.feature_importances_)

# Creating the model for automatic web api
pk.dump(RFC_Model, open('model.pkl', 'wb'))

# Predicting the output data
y_RFC_pred = RFC_Model.predict(X_test)

# Performance measure
print(accuracy_score(y_test, y_RFC_pred) * 100)

# Another one
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_RFC_pred)
acc = 100 * (cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])

print(acc)

# Predicting all data classification
y_RFC_pred_train = RFC_Model.predict(X_train)
print(accuracy_score(y_train, y_RFC_pred_train) * 100)
#import pandas
import pandas as pd
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# load dataset
pima = pd.read_csv("diabetess.txt", header=None, names=col_names)
print(pima.head())
#split dataset in features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols] # Features
y = pima.label # Target variable
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
from sklearn.linear_model import LogisticRegression

# instantiate the model (liblinear solver, otherwise default parameters)
logreg = LogisticRegression(C=1.0, random_state=45, solver='liblinear', max_iter=100)

# fit the model with data
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
def run():
    #reading the dataset
    dfx= pd.read_csv(config.TRAINING_FILE).fillna("none")

    dfx.sentiment= dfx.sentiment.apply(
        lambda x:1 if x=="positive" else 0
    )

    

    #splitting into training and validation set
    df_train,df_valid= model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify= dfx.sentiment.values
    )

    df_train= df_train.reset_index(drop=True)
    df_valid= df_valid.reset_index(drop=True)

    train_dataset=dataset.BERTDataset(
        review=df_train.review.values,
        target=df_train.sentiment.values

    )

    train_data_loader= torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )


    valid_dataset = dataset.BERTDataset(
        review=df_valid.review.values,
        target=df_valid.sentiment.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    cuda= torch.cuda.is_available()

    if cuda:
        device= torch.device("cuda")

    else:
        device= torch.device("cpu")

    model= BERTBaseUncased()
    model.to(device)


    param_optimizer=list(model.named_parameters())
    no_decay=["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters=[
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]


    # print("Printing optimizer parameters......******")
    # print(optimizer_parameters)
    # print("Printing optimizer parameters......******")

    num_train_steps= int(len(df_train)/config.TRAIN_BATCH_SIZE*config.EPOCHS)

    optimizer=AdamW(optimizer_parameters,lr=3e-5)

    scheduler=get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    best_accuracy=0

    for epochs in range(config.EPOCHS):

        #calling the training function in engine.py file
        engine.train_fn(train_data_loader,model,optimizer,device,scheduler)
        
        #calling the evaluation function from the engine.py file to compute evaluation
        outputs,targets=engine.eval_fn(valid_data_loader,model,device)

        outputs=np.array(outputs)>=0.5

        #calculating the accuracy after every epoch
        accuracy=metrics.accuracy_score(targets,outputs)
        print(f"Accuracy Score = {accuracy}")
        
        #updating the accuracy
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
import numpy as np
import pandas as pd
from sklearn import datasets
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")
X = data.iloc[:, :-1].values
y = data.iloc[:, 4].values

from sklearn.model_selection import train_test_split
# Splitting the data for training(70%) and testing(30%)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print("The training data is\n", X_train)
print("The testing data is\n", X_test)
print("The expected result is\n", y_test)
# By using LogisticRegression

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
# Training the dataset using Logistic Regression Model
clf.fit(X_train, y_train)
# Prediction

prediction = clf.predict(X_test)
print("The prediction by the machine is\n", prediction)
from sklearn.metrics import accuracy_score
# Finding the accuracy of the model

a = accuracy_score(y_test, prediction)
print("The accuracy of the model is:", a)
Example #45
    def forward(self, inputs, mask=None):
        output = self.gcn_1(inputs, mask=mask)
        output = self.dropout(output)
        output = self.gcn_2([output] + inputs[1:], mask=mask)
        return output


if __name__ == "__main__":
    model = GraphClassifier(A[0].shape[0], HIDDEN, output_dimension, BASES, DO,
                            len(A))
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=L2)
    criterion = nn.CrossEntropyLoss()
    X = sparse.csr_matrix(A[0].shape).todense()
    for epoch in range(NB_EPOCH):
        t = time.time()
        output = model([X] + A)
        gold = y_train[idx_train].argmax(dim=-1)
        loss = criterion(output[idx_train], gold)
        score = accuracy_score(gold.cpu(), output[idx_train].argmax(dim=-1).cpu())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print("train_accuracy:", score, "loss:,", loss.item(), "time:",
              time.time() - t)
        test_gold = y_test[idx_test].argmax(dim=-1)
        test_output = output[idx_test]
        test_score = accuracy_score(test_gold.cpu(), test_output.argmax(dim=-1).cpu())
        test_loss = criterion(test_output, test_gold)
        print("test_accuracy:", test_score, "loss:", test_loss.item())
		Change_W,Change_B = BackPropagation(MATY,MATY_o,Netj,Oj,WEIGHTS)

		# print(len(WEIGHTS))
		# print(len(Change_W))
		for i in range(0,len(WEIGHTS)):
			WEIGHTS[i] -= alpha*Change_W[len(WEIGHTS)-i-1]
			BIAS[i] -= alpha*Change_B[len(WEIGHTS)-i-1]
		# print(batch)

		# break;
	print(epoch)


end3 = time.time()

y_pred = prediction(MATX1,WEIGHTS,BIAS)


accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
cm = confusion_matrix(y_test, y_pred)  # avoid shadowing the confusion_matrix function
print(cm)
end4 = time.time()


print("reading time" , end2-start)
print("training time" , end3-end2)
print("testing time" , end4-end3)


# In[ ]:


for clf in classifiers:
    print("="*30)
    
    clf_name = clf.__class__.__name__
    print(clf_name)

    clf.fit(x_train, y_train)
    
    #Training Accuracy
    y_train_pred = clf.predict(x_train)
    train_acc = accuracy_score(y_train, y_train_pred)
    
    #Validation Accuracy
    y_valid_pred = clf.predict(x_valid)
    valid_acc = accuracy_score(y_valid, y_valid_pred)
    
    print("Validation Accuracy: {:.4%}".format(valid_acc))
    
    log_entry = pd.DataFrame([[clf_name, train_acc, valid_acc]], columns=log_cols)
    log = pd.concat([log, log_entry], ignore_index=True)


# In[ ]:


log.sort_values('Validation Accuracy', ascending=True).plot.barh(x='Classifier', y='Validation Accuracy', figsize=(16,7))
  
print(X.head(n=20))

print(Y.head())



# Split the 'features' and 'income' data into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y,test_size = 0.2,random_state = 0)


clf = SVC()
clf.fit(X_train1, y_train1)
predicted = clf.predict(X_test1)

print("Accuracy = {}\nPrecision = {}\nRecall = {}\nF1 Score = {}".format(metrics.accuracy_score(y_test1, predicted), metrics.precision_score(y_test1, predicted),metrics.recall_score(y_test1, predicted),metrics.f1_score(y_test1, predicted)))

score_p.append([metrics.accuracy_score(y_test1, predicted), metrics.precision_score(y_test1, predicted),metrics.recall_score(y_test1, predicted),metrics.f1_score(y_test1, predicted)]) 

print(confusion_matrix(y_test1, predicted))


neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X_train1, y_train1) 
predicted = neigh.predict(X_test1)
print("Accuracy = {}\nPrecision = {}\nRecall = {}\nF1 Score = {}".format(metrics.accuracy_score(y_test1, predicted), metrics.precision_score(y_test1, predicted),metrics.recall_score(y_test1, predicted),metrics.f1_score(y_test1, predicted)))

score_p.append([metrics.accuracy_score(y_test1, predicted), metrics.precision_score(y_test1, predicted),metrics.recall_score(y_test1, predicted),metrics.f1_score(y_test1, predicted)]) 

print(confusion_matrix(y_test1, predicted))
# Importing the required libraries.
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import classification_report


# In[106]:


# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
predmnb = mnb.predict(x_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test, predmnb))
print("Score:", round(accuracy_score(y_test, predmnb) * 100, 2))
print("Classification Report:")
print(classification_report(y_test, predmnb))


# **The Multinomial Naive Bayes classifier scores 86.06% accuracy. Since that is a high score, I will treat this model as my baseline.**

# # 5.4.2 Random Forest Classifier

# Our feature (the review text) has no simple linear relationship with the target (review_stars), which is one reason to try a Random Forest Classifier.
# The vital thing for a Random Forest Classifier to make accurate class predictions is that the trees of the forest, and more importantly their predictions, are uncorrelated (or at least weakly correlated) with each other.
#
# Random forests are an ensemble learning method for classification: they construct a multitude of decision trees at training time and output the class that is the mode of the individual trees' predicted classes (or, for regression, the mean of their predictions). A sketch of such a classifier on this data follows in the next cell.

# In[107]:
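
# A hedged sketch of the Random Forest cell (the hyperparameters here are illustrative, not the author's; x_train/x_test are the same vectorized reviews used for the Naive Bayes baseline above):

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
predrf = rf.predict(x_test)
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, predrf))
print("Score:", round(accuracy_score(y_test, predrf) * 100, 2))
print("Classification Report:")
print(classification_report(y_test, predrf))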

Beispiel #50
0
def main():
    global start_dens
    st.title("Multi Variate Contamination in Fuel")

    menu = ["Home", "Model Training", "Contaminant Prediction"]
    choice = st.sidebar.selectbox("Menu", menu)

    if choice == "Home":
        st.subheader("Home")
        image_file = st.file_uploader("Upload Image",
                                      type=['png', 'jpeg', 'jpg'])
        if image_file is not None:
            # To See Details
            # st.write(type(image_file))
            # st.write(dir(image_file))
            file_details = {
                "Filename": image_file.name,
                "FileType": image_file.type,
                "FileSize": image_file.size
            }
            st.write(file_details)

            img = load_image(image_file)
            st.image(img, width=250)

    if choice == "Model Training":
        if st.button("Start Train"):
            allResults = glob.glob(
                'batch31-38_with_target_diff_prx_2000tampered/*.csv',
                recursive=True)
            allResults = sorted(allResults, key=lambda x: (x.split("/")[-1]))
            #st.write(allResults)

            newpath1 = 'batch31-38_with_target_diff_prx_2000tampered/'
            # newpath1='/content/drive/MyDrive/OIL SAMPLES DATA1/'

            folder = newpath1  ## data directory
            tag = str('.csv')  ## format to import
            initString = '-'  ## string in csv file name to search for category (normal, sludge, water, together)
            fileList = directorySearch(folder, tag)
            # print(fileList)
            final_filelist = pd.DataFrame(
                index=range(0, len(fileList)),
                columns=['file', 'Target', 'file_dir', 'window_id'])
            for i in range(0, (len(fileList))):
                fileName = fileList[i]
                res1 = fileName.find(initString)
                if res1 == -1:
                    res1 = fileName.find('_')
                if res1 == -1:
                    print(res1)
                    res1 = 5
                c1 = int(res1 + 1)
                c5 = int(res1 + 12)

                wloc = fileName.rfind('W', c1, c5)
                sloc = fileName.rfind('S', c1, c5)
                tloc = fileName.rfind('T', c1, c5)
                finalCat = max([wloc, sloc, tloc])
                strCat = fileName[finalCat]
                # print(strCat)

                # Record the file and its category character; also map the
                # character to an integer class label (kept from the original
                # script, though the model below learns from the encoded
                # 'Target' column instead).
                classLabel = 0
                if strCat == 'S':      # sludge
                    classLabel = 1
                elif strCat == 'W':    # water
                    classLabel = 2
                elif strCat == 'T':    # sludge + water mix
                    classLabel = 3
                final_filelist.loc[i, 'file'] = fileName
                final_filelist.loc[i, 'Target'] = strCat
                final_filelist.loc[i, 'file_dir'] = allResults[i]
                final_filelist.loc[i, 'window_id'] = i + 1

            ll = []
            for i, j in enumerate(final_filelist['file']):
                # print(i,j)
                head, tail = os.path.split(j)
                r1 = re.split('_', tail)
                r2 = re.split('-', r1[0])
                print(r2)
                # keep files whose batch suffix does not contain 'A'
                if len(r2) in (2, 3, 4) and 'A' not in r2[-1]:
                    ll.append(tail)

            dff = pd.DataFrame({'file': ll})
            dff['file'].count()

            df4 = pd.DataFrame()
            c = 0
            # for i,j in enumerate(allResults):
            for i, j in enumerate(dff['file']):
                # print(i,j)
                df = pd.read_csv(
                    'batch31-38_with_target_diff_prx_2000tampered/' + j)
                head, tail = os.path.split(j)
                # print(i,df.shape[1])
                df4[tail] = (df['Pressure_tmp'].rolling(300).std())

            df9 = pd.DataFrame(index=range(0, len(df4.columns)),
                               columns=[
                                   'file', 'pre-trans_mean', 'trans_mean',
                                   'post-trans_mean', 'transient_width'
                               ])

            for z, col in enumerate(df4.columns):
                start = 0
                end = 0

                a = df4[col]
                b = a.quantile(0.7)  # threshold: the 70th percentile

                # find the first position whose value exceeds the threshold
                for i, j in enumerate(a):
                    if j > b:
                        start = i
                        break
                # from there, find the first value back below the threshold,
                # at least 200 samples later (to capture the full transient)
                for k, l in enumerate(a.iloc[start:]):
                    if l < b and k > 200:
                        end = start + k
                        break
                df9.loc[z, 'file'] = col
                df9.loc[z, 'pre-trans_mean'] = df4[col].iloc[:start].mean()
                df9.loc[z, 'trans_mean'] = df4[col].iloc[start:end].mean()
                df9.loc[z, 'post-trans_mean'] = df4[col].iloc[end:].mean()
                df9.loc[z, 'transient_width'] = max(end - start, 0)

            df5 = df4.describe().transpose()
            df5 = df5.reset_index()
            df10 = pd.merge(df9,
                            df5[['index', 'std', 'max']],
                            left_on='file',
                            right_on='index',
                            how='left')
            del df10['index']
            df10 = df10.set_index('file')

            df11 = pd.merge(df10,
                            final_filelist[['file', 'Target']],
                            left_on='file',
                            right_on='file',
                            how='left')
            df11 = df11.set_index('file')
            df11 = df11.astype({
                'pre-trans_mean': 'float64',
                'trans_mean': 'float64',
                'post-trans_mean': 'float64',
                'transient_width': 'float64'
            })

            df12 = pd.DataFrame()
            for i, j in enumerate(dff['file']):
                # print(i,j)
                df = pd.read_csv(
                    'batch31-38_with_target_diff_prx_2000tampered/' + j)
                head, tail = os.path.split(j)
                # print(i,df.shape[1])
                df12[tail] = (df['Density'].rolling(300).std())

            df13 = pd.DataFrame(index=range(0, len(df12.columns)),
                                columns=[
                                    'file', 'pre-trans_mean-density',
                                    'trans_mean-density',
                                    'post-trans_mean-density',
                                    'transient_width-density'
                                ])

            for z, col in enumerate(df12.columns):
                start = 0
                end = 0
                print(col)  # file name

                a = df12[col]
                b = a.quantile(0.7)  # threshold: the 70th percentile

                # find the first position whose value exceeds the threshold
                for i, j in enumerate(a):
                    if j > b:
                        start = i
                        break
                # from there, find the first value back below the threshold,
                # at least 200 samples later (to capture the full transient)
                for k, l in enumerate(a.iloc[start:]):
                    if l < b and k > 200:
                        end = start + k
                        break
                df13.loc[z, 'file'] = col
                df13.loc[z, 'pre-trans_mean-density'] = df12[col].iloc[:start].mean()
                df13.loc[z, 'trans_mean-density'] = df12[col].iloc[start:end].mean()
                df13.loc[z, 'post-trans_mean-density'] = df12[col].iloc[end:].mean()
                df13.loc[z, 'transient_width-density'] = max(end - start, 0)
            df13 = df13.astype({
                'pre-trans_mean-density': 'float64',
                'trans_mean-density': 'float64',
                'post-trans_mean-density': 'float64',
                'transient_width-density': 'float64'
            })
            df11.drop(['std'], axis=1, inplace=True)

            df14 = df13[[
                'file', 'pre-trans_mean-density', 'post-trans_mean-density'
            ]].copy()  # copy to avoid SettingWithCopyWarning
            df14['pre-trans_mean-density'] = df14[
                'pre-trans_mean-density'].fillna(0)

            df11.dropna(inplace=True)

            le = preprocessing.LabelEncoder()
            df11['Target'] = le.fit_transform(df11['Target'])
            df11.loc[:, 'Target']

            df15 = df11.merge(df14, how='inner', on='file')
            del df15['file']
            df15 = df15[[
                'pre-trans_mean', 'trans_mean', 'post-trans_mean',
                'transient_width', 'max', 'pre-trans_mean-density',
                'post-trans_mean-density', 'Target'
            ]]
            st.write(df15)
            col = df15.columns

            features = col.tolist()
            feature = features[:-1]
            target = features[-1]

            # x=dff_tr.loc[:,feature].values
            # y=dff_tr.loc[:,target].values
            x = df15.loc[:, feature].values
            y = df15.loc[:, target].values
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.3, random_state=98)
            ost = SMOTE()
            # oversample the minority classes in the training split only
            os_data_X, os_data_y = ost.fit_resample(x_train, y_train)
            os_data_X = pd.DataFrame(data=os_data_X, columns=feature)
            os_data_y = pd.DataFrame(data=os_data_y, columns=['Target'])

            clf_rf_bal = RandomForestClassifier(n_estimators=10,
                                                random_state=99)
            clf_rf_bal = clf_rf_bal.fit(os_data_X, os_data_y.values.ravel())

            # Optional (commented out in the original): feature importances,
            # either via permutation importance or the forest's built-in scores.
            # from sklearn.inspection import permutation_importance
            # results = permutation_importance(clf_rf_bal, x, y, scoring='accuracy')
            # for i, v in enumerate(results.importances_mean):
            #     print('Feature: %0d, Score: %.5f' % (i, v))
            # for i, v in enumerate(clf_rf_bal.feature_importances_):
            #     print('Feature: %0d, Score: %.5f' % (i, v))

            y_pred_bal = clf_rf_bal.predict(x_test)
            bal_cm = confusion_matrix(y_test, y_pred_bal)

            print('balanced classification report')
            cls_rpt = classification_report(y_test, y_pred_bal)
            st.write(f'classification report : {cls_rpt}')

            bal_ac = accuracy_score(y_test, y_pred_bal)
            st.write(f'accuracy score : {bal_ac}')

            filename = 'finalized_model1.pkl'
            pickle.dump(clf_rf_bal,
                        open(os.path.join(os.getcwd(), filename), 'wb'))

    if choice == "Contaminant Prediction":
        st.subheader("Dataset")
        data_file = st.file_uploader("Upload CSV", type=['csv'])
        if st.button("Process") and data_file is not None:
            file_details = {
                "Filename": data_file.name,
                "FileType": data_file.type,
                "FileSize": data_file.size
            }
            st.write(file_details)

            df = pd.read_csv(data_file)
            st.dataframe(df)

            tag = str('.csv')  ## format to import
            initString = '-'  ## string in csv file name to search for category (normal, sludge, water, together)
            fileName = data_file.name
            # print(fileList)
            final_filelist = pd.DataFrame(columns=['file', 'Target'])
            res1 = fileName.find(initString)
            if res1 == -1:
                res1 = fileName.find('_')
            if res1 == -1:
                print(res1)
                res1 = 5
            c1 = int(res1 + 1)
            c5 = int(res1 + 12)

            wloc = fileName.rfind('W', c1, c5)
            sloc = fileName.rfind('S', c1, c5)
            tloc = fileName.rfind('T', c1, c5)
            finalCat = max([wloc, sloc, tloc])
            strCat = fileName[finalCat]

            st.write(f'FileName:{fileName}')
            if strCat not in ['S', 'T', 'W']:
                strCat = 'No Contaminant'
                st.write('No Contaminant')
            if strCat in ['S', 'T', 'W']:
                st.write('Contaminant Exists')
            if strCat == 'S':
                st.write('Type of Contaminant: Sludge')
            if strCat == 'W':
                st.write('Type of Contaminant: Water')
            if strCat == 'T':
                st.write('Type of Contaminant: Sludge+Water')

            df4 = pd.DataFrame()
            df4['roll_std'] = df['Pressure_tmp'].rolling(300).std()
            df5 = df4.describe().transpose()
            df5 = df5.reset_index()
            maxx = df5['max'][0]
            df9 = pd.DataFrame(columns=[
                'file', 'pre_trans_mean', 'trans_mean', 'post_trans_mean',
                'transient_width'
            ])

            a = df4['roll_std']
            st.write(a)

            b = a.quantile(0.7)  # threshold: the 70th percentile
            st.write(b)
            x = df4['roll_std'] > b  # mask of values greater than the threshold
            st.write(x)

            start = 0
            end = 0
            # find the first position whose value exceeds the threshold
            for i, j in enumerate(a):
                if j > b:
                    start = i
                    break
            # from there, find the first value back below the threshold,
            # at least 200 samples later (to capture the full transient)
            for k, l in enumerate(a.iloc[start:]):
                if l < b and k > 200:
                    end = start + k
                    break

            file = data_file.name
            pre_trans_mean = df4['roll_std'].iloc[:start].mean()
            trans_mean = df4['roll_std'].iloc[start:end].mean()
            post_trans_mean = df4['roll_std'].iloc[end:].mean()
            transient_width = max(end - start, 0)

            df12 = pd.DataFrame()
            df12['roll_std_den'] = (df['Density'].rolling(300).std())

            # (df13 mirrors the training-side feature frame; it is not used
            # further in this branch but is kept from the original script)
            df13 = pd.DataFrame(columns=[
                'file', 'pre_trans_mean_dens', 'trans_mean_dens',
                'post_trans_mean_dens', 'transient_width_dens'
            ])

            p = df12['roll_std_den']
            q = p.quantile(0.7)  # threshold: the 70th percentile
            xx = df12['roll_std_den'] > q  # mask of values greater than the threshold

            start_dens = 0
            end_dens = 0
            # find the first position whose value exceeds the threshold
            for i, j in enumerate(p):
                if j > q:
                    start_dens = i
                    break
            # from there, find the first value back below the threshold,
            # at least 200 samples later
            for k, l in enumerate(p.iloc[start_dens:]):
                if l < q and k > 200:
                    end_dens = start_dens + k
                    break

            pre_trans_mean_dens = df12['roll_std_den'].iloc[:start_dens].mean()
            trans_mean_dens = df12['roll_std_den'].iloc[
                start_dens:end_dens].mean()
            post_trans_mean_dens = df12['roll_std_den'].iloc[end_dens:].mean()
            transient_width_dens = max(end_dens - start_dens, 0)

            zz = {
                'file': file,
                'pre_trans_mean': pre_trans_mean,
                'trans_mean': trans_mean,
                'post_trans_mean': post_trans_mean,
                'pre_trans_mean_dens': pre_trans_mean_dens,
                'trans_mean_dens': trans_mean_dens,
                'post_trans_mean_dens': post_trans_mean_dens
            }

            st.write(zz)

            # load the trained model from disk and predict on the engineered
            # features, in the same column order used during training
            loaded_model = pickle.load(open('finalized_model1.pkl', 'rb'))
            result = loaded_model.predict([[
                pre_trans_mean, trans_mean, post_trans_mean,
                transient_width, maxx, pre_trans_mean_dens,
                post_trans_mean_dens
            ]])

            if result == 0:
                st.write(f'Predicted Contaminant: Sludge')
            if result == 1:
                st.write(f'Predicted Contaminant: Water')
            if result == 2:
                st.write(f'Predicted Contaminant: Water+Sludge')
            if result == 3:
                st.write('No Contaminant')
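
The same transient search (rolling standard deviation, 70th-percentile threshold, then a start/end scan with a 200-sample minimum width) is written out four times above. A hedged helper sketch that factors it out; the function name and signature are mine, not the app's:

import pandas as pd

def find_transient(series, quantile=0.7, min_width=200):
    """Return (start, end) positions of the first span where `series` rises
    above its `quantile` threshold, counting an end only after `min_width`
    samples; (0, 0) if no such span is found."""
    threshold = series.quantile(quantile)
    start = end = 0
    # first position whose value exceeds the threshold
    for i, value in enumerate(series):
        if value > threshold:
            start = i
            break
    # first value back below the threshold, at least min_width samples later
    for k, value in enumerate(series.iloc[start:]):
        if value < threshold and k > min_width:
            end = start + k
            break
    return start, end

Usage would mirror the loops above, e.g. start, end = find_transient(df['Pressure_tmp'].rolling(300).std()).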
Beispiel #51
0
def scorer_01loss(estimator, X, y):
    y_pred = estimator.predict(X)
    return 1 - accuracy_score(y, y_pred)
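
Beispiel #51's scorer follows scikit-learn's scorer(estimator, X, y) convention, so it can be passed straight to the model-selection helpers. A hedged usage sketch (note that sklearn assumes higher-is-better, so a loss like this should not drive a hyper-parameter search as-is):

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
losses = cross_val_score(DecisionTreeClassifier(), X_demo, y_demo,
                         scoring=scorer_01loss, cv=5)
print("mean 0/1 loss across folds:", losses.mean())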
Beispiel #52
0
# Imports and data setup inferred from the aliases used below (numpy as n,
# pandas as p, sklearn.metrics as so); x is the iris feature DataFrame.
import numpy as n
import pandas as p
import matplotlib.pyplot as plt
from sklearn import metrics as so
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

iris = load_iris()
x = p.DataFrame(iris.data)
print(x)
x.columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
colormap = n.array(['red', 'lime', 'black'])  # one plotting colour per class
y = p.DataFrame(iris.target)
print(y)
y.columns = ['Target']
plt.figure(figsize=(14, 7))
plt.scatter(x.sepallength, x.sepalwidth, c=colormap[y.Target], s=40)
plt.title("Sepal data before model")
plt.show()
plt.figure(figsize=(14, 7))
plt.scatter(x.petallength, x.petalwidth, c=colormap[y.Target])
plt.title("petal data before model")
plt.show()
model = KMeans(n_clusters=3)  # iris has three species; the original used 2, but the [1, 0, 2] remapping below assumes 3 clusters
model.fit(x)
centroids = model.cluster_centers_  # cluster centroids
print("centroids", centroids)
labels = model.labels_  # cluster assignment for each sample
print(labels)
plt.figure(figsize=(14, 7))
plt.scatter(x.petallength, x.petalwidth, c=colormap[labels])
plt.title("petal data after model")
plt.show()

# remap the arbitrary cluster ids onto the true class ids before scoring
pred_y = n.choose(labels, [1, 0, 2]).astype(n.int64)
#print(labels)
print(pred_y)
print(so.accuracy_score(y, pred_y))
print(so.confusion_matrix(y, pred_y))
"""confusion matrix"""

results_for_conf_matrix = model.predict(x_test)
results_for_conf_matrix

results = []
for i in range(len(results_for_conf_matrix)):
  results.append(list(results_for_conf_matrix[i]).index(max(results_for_conf_matrix[i])))
cm=confusion_matrix(y_test, results)
cm
class_names=['Cloudy' ,'Rain' ,'Sun_shine','Sunrise']

"""accuracy: 89.7"""

from sklearn.metrics import accuracy_score
accuracy_score(y_test, results)

"""confusion_matrix_without_normalisation"""

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute the normalized matrix if requested, then draw it.
    # (Hedged reconstruction of the truncated cell, following the standard
    # scikit-learn documentation example; assumes numpy is imported as np.)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
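
A hedged usage sketch with the cm and class_names computed above:

plot_confusion_matrix(cm, class_names, title='Weather classes, without normalization')
plt.show()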
Beispiel #54
0
def judge_model(x_train, x_test, y_train, y_test, model):
    print('-' * 20)
    print('Baseline Performance')
    print('-> Train Acc:', accuracy_score(y_train, model.predict(x_train)))
    print('-> Test Acc:', accuracy_score(y_test, model.predict(x_test)))
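
A hedged usage sketch for judge_model on a stand-in dataset (the breast-cancer data and decision tree here are illustrations; a deliberately overfit tree makes the train/test gap visible):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

Xb, yb = load_breast_cancer(return_X_y=True)
xb_train, xb_test, yb_train, yb_test = train_test_split(Xb, yb, random_state=0)
fitted = DecisionTreeClassifier(random_state=0).fit(xb_train, yb_train)
judge_model(xb_train, xb_test, yb_train, yb_test, fitted)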
Beispiel #55
0
joblib.dump(classifier, 'knn_model.sav')

# some time later...

# load the model from disk
loaded_model = joblib.load('knn_model.sav')
print('Accuracy of loaded model')
result = loaded_model.score(X_test, y_test)
print(result)

from sklearn.metrics import classification_report, confusion_matrix
y_pred = loaded_model.predict(X_test)  # predictions for the reports below
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(y_pred)
from sklearn.metrics import accuracy_score
print('Accuracy Score: ', accuracy_score(y_test, y_pred))

from sklearn.metrics import f1_score
f1_metric = f1_score(y_test, y_pred, average='macro')
print("f1 score macro:", f1_metric)

f1_metric_micro = f1_score(y_test, y_pred, average='micro')
print("f1 score micro:", f1_metric_micro)

#take input from the loaded model
input_sepal_length = float(input("Enter sepal length: "))
input_sepal_width = float(input("Enter sepal width:"))
input_petal_length = float(input("Enter petal Length: "))
input_petal_width = float(input("Enter petal width: "))
# (the original cell is truncated here; completed from the four inputs above)
output = loaded_model.predict([[input_sepal_length, input_sepal_width,
                                input_petal_length, input_petal_width]])
print("Predicted class:", output[0])
Beispiel #56
0
def scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return accuracy_score(y, y_pred)
Beispiel #57
0
def get_accu(model, data, y_gt):
    # threshold the positive-class probability at 0.5 to get hard labels
    y_pred = model.predict_proba(data)[:, 1]
    y_class = (y_pred > 0.5)
    accu = accuracy_score(y_gt, y_class)
    return accu
train_data, test_data, train_labels, test_labels = train_test_split(
    all_tweets, labels, test_size=0.2, random_state=1)
print(len(train_data))
print(len(test_data))

#Making the Count Vectors
counter = CountVectorizer()
counter.fit(train_data)
train_counts = counter.transform(train_data)
test_counts = counter.transform(test_data)

print(train_data[3])
print(train_counts[3])

#Train and Test the Naive Bayes Classifier
classifier = MultinomialNB()
classifier.fit(train_counts, train_labels)
predictions = classifier.predict(test_counts)

#Evaluating Your Model via accuracy score (and confusion matrix)
from sklearn.metrics import accuracy_score
print(accuracy_score(test_labels, predictions))
#Accuracy lies around 67.8%, which is ok, but not great

from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_labels, predictions))

#Testing the prediction function with own tweet
tweet = 'Pierre, the baguette is tasty'
tweet_counts = counter.transform([tweet])
print(classifier.predict(tweet_counts))
# In[]

#trainn = train.drop(['Name','Age', 'Ticket', 'Fare', 'Cabin'], axis = 1)

# In[]
X = traincopy[['Pclass', 'Sex', 'SibSp', 'Parch', 'Age']]
Y = traincopy['Survived']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
# In[]
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    pred = knn.predict(X_test)
    print("Accuracy is", accuracy_score(Y_test, pred) * 100, "for k =", k)
    # (the original also incremented k manually, which has no effect inside a for loop)
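
Rather than scoring every k on the single held-out split, a hedged sketch that picks k by cross-validation on the training set (same variable names as above):

from sklearn.model_selection import cross_val_score

cv_scores = {}
for k in range(1, 21):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    cv_scores[k] = cross_val_score(knn_k, X_train, Y_train, cv=5).mean()
best_k = max(cv_scores, key=cv_scores.get)
print("Best k by 5-fold CV:", best_k, "with accuracy", cv_scores[best_k])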

# In[]

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=8)

# In[]

knn.fit(X_train, Y_train)
knn.score(X_test, Y_test)

# In[]

pred = knn.predict(testcopy)
Beispiel #60
0

# Imports and context assumed for this snippet: the wine data loaded into
# `dataset` (a DataFrame with a 'Class' column) and its features in `df`.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

target = dataset['Class'].iloc[0:177]  # keep the target aligned with the sliced feature rows below

df = df.iloc[0:177, [1, 12]]

sc = StandardScaler()
df = sc.fit_transform(df)

pca = PCA(n_components=2)
pca_x = pca.fit_transform(df)
pca_df = pd.DataFrame(data=pca_x, columns=['comp1', 'comp2'])

KModel = KMeans(n_clusters=3, random_state=2)
KModel.fit_predict(pca_df)
KModel.labels_

colormap = np.array(['Red', 'Blue', 'Green'])

z = plt.scatter(pca_df.comp1, pca_df.comp2, c=colormap[KModel.labels_])
KModel.labels_
accuracy_score(target, KModel.labels_)
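
Because k-means cluster ids are arbitrary, accuracy_score(target, KModel.labels_) is only meaningful after mapping each cluster to its majority true class. A hedged sketch, assuming the non-negative integer class labels of the wine data:

mapped = np.zeros_like(KModel.labels_)
for cluster in range(3):
    mask = KModel.labels_ == cluster
    # assign every member of this cluster its majority true class
    mapped[mask] = np.bincount(target[mask]).argmax()
print('accuracy after majority mapping:', accuracy_score(target, mapped))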