def createNaiveBayesModel(feature_vector_data):
    '''
        Uses the dimensionally reduced feature vectors of each (instance, sense id) pair
        to build one Gaussian naive Bayes model per word type
    '''
    naive_bayes_model_word_type = {}
    
    for word_type, instance_sense_dict in feature_vector_data.items():
        vectors = []
        senses  = []
        
        for key, vector in instance_sense_dict.items():
            sense = key[1]
            data_type = key[2]
            
            #Need to grab the TSNE vectors and senses of only the training data
            #Thus, we ignore all the validation data
            if data_type == "training":
                vectors.append(vector)
                senses.append(sense)
            
        vectors = np.array(vectors)
        senses = np.array(senses)
        nb = GaussianNB()
        nb.fit(vectors, senses)
        naive_bayes_model_word_type[word_type] = nb
    
    return naive_bayes_model_word_type
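A small consumption sketch for the dictionary returned above (an assumption-laden illustration: it presumes, like the loop above, that each instance_sense_dict key is an (instance_id, sense, data_type) tuple mapping to a reduced vector):

import numpy as np

def predictValidationSenses(feature_vector_data, naive_bayes_model_word_type):
    '''
        Hypothetical helper: apply each word type's fitted model to its validation vectors
    '''
    predictions = {}
    for word_type, instance_sense_dict in feature_vector_data.items():
        model = naive_bayes_model_word_type[word_type]
        for key, vector in instance_sense_dict.items():
            if key[2] == "validation":
                # GaussianNB.predict expects a 2-D array, hence the reshape
                predictions[(word_type, key[0])] = model.predict(
                    np.asarray(vector).reshape(1, -1))[0]
    return predictions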
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()
    

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)
    

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)


    ### calculate and return the accuracy on the test data
    ### this is slightly different than the example, 
    ### where we just print the accuracy
    ### you might need to import an sklearn module
    intersect = [i for i, j in zip(pred, labels_test) if i == j]
    matched = len(intersect)
    total = len(labels_test)
    accuracy = float(matched) / float(total)
    return accuracy
def selectKBest(previous_result, data):
	# remove 'restricted_stock_deferred' and 'director_fees'
	previous_result.pop(4)
	previous_result.pop(4)

	result = []
	_k = 10
	for k in range(0,_k):
		feature_list = ['poi']
		for n in range(0,k+1):
			feature_list.append(previous_result[n][0])

		data = featureFormat(my_dataset, feature_list, sort_keys = True, remove_all_zeroes = False)
		labels, features = targetFeatureSplit(data)
		features = [abs(x) for x in features]
		from sklearn.cross_validation import StratifiedShuffleSplit
		cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
		features_train = []
		features_test  = []
		labels_train   = []
		labels_test    = []
		for train_idx, test_idx in cv:
			for ii in train_idx:
				features_train.append( features[ii] )
				labels_train.append( labels[ii] )
			for jj in test_idx:
				features_test.append( features[jj] )
				labels_test.append( labels[jj] )
		from sklearn.naive_bayes import GaussianNB
		clf = GaussianNB()
		clf.fit(features_train, labels_train)
		predictions = clf.predict(features_test)
		score = score_func(labels_test,predictions)
		result.append((k+1,score[0],score[1],score[2]))
	return result
	def scikitNBClassfier(self):
		dataMat, labels = self.loadProcessedData()
		bayesian = Bayesian()
		myVocabList = bayesian.createVocabList(dataMat)
		## Build the bag-of-words matrix
		trainMat = []
		for postinDoc in dataMat:
			trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))

		from sklearn.naive_bayes import GaussianNB

		gnb = GaussianNB()
		X = array(trainMat)
		y = labels

		testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
		testEntry = self.testEntryProcess(testText)

		bayesian = Bayesian()
		thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
		## Fit and predict
		y_pred = gnb.fit(X, y).predict(thisDoc)
		clabels = ['军事', '体育']
		y_pred = gnb.fit(X, y).predict(X)
		print("Number of mislabeled points : %d" % (labels != y_pred).sum())
def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array([[-1.1, -1.5],
                  [-1.2, -1.4],
                  [-3.4, -2.2],
                  [1.1, 1.2],
                  [2.1, 1.4],
                  [3.1, 2.3]])

    y = np.array([1, 1, 1, 2, 2, 2])

    assert_equal(all(clf1.fit(X, y).predict(X)), all([1, 1, 1, 2, 2, 2]))
    assert_equal(all(clf2.fit(X, y).predict(X)), all([1, 1, 1, 2, 2, 2]))
    assert_equal(all(clf3.fit(X, y).predict(X)), all([1, 1, 1, 2, 2, 2]))

    eclf = VotingClassifier(estimators=[
                            ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                            voting='hard',
                            weights=[1, 1, 1])
    assert_equal(all(eclf.fit(X, y).predict(X)), all([1, 1, 1, 2, 2, 2]))

    eclf = VotingClassifier(estimators=[
                            ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                            voting='soft',
                            weights=[1, 1, 1])
    assert_equal(all(eclf.fit(X, y).predict(X)), all([1, 1, 1, 2, 2, 2]))
Example #7
def test_gnb_sample_weight():
    """Test whether sample weights are properly used in GNB. """
    # Sample weights all being 1 should not change results
    sw = np.ones(6)
    clf = GaussianNB().fit(X, y)
    clf_sw = GaussianNB().fit(X, y, sw)

    assert_array_almost_equal(clf.theta_, clf_sw.theta_)
    assert_array_almost_equal(clf.sigma_, clf_sw.sigma_)

    # Fitting twice with half sample-weights should result
    # in same result as fitting once with full weights
    sw = rng.rand(y.shape[0])
    clf1 = GaussianNB().fit(X, y, sample_weight=sw)
    clf2 = GaussianNB().partial_fit(X, y, classes=[1, 2], sample_weight=sw / 2)
    clf2.partial_fit(X, y, sample_weight=sw / 2)

    assert_array_almost_equal(clf1.theta_, clf2.theta_)
    assert_array_almost_equal(clf1.sigma_, clf2.sigma_)

    # Check that duplicate entries and correspondingly increased sample
    # weights yield the same result
    ind = rng.randint(0, X.shape[0], 20)
    sample_weight = np.bincount(ind, minlength=X.shape[0])

    clf_dupl = GaussianNB().fit(X[ind], y[ind])
    clf_sw = GaussianNB().fit(X, y, sample_weight)

    assert_array_almost_equal(clf_dupl.theta_, clf_sw.theta_)
    assert_array_almost_equal(clf_dupl.sigma_, clf_sw.sigma_)
Example #8
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)
    


    ### calculate and return the accuracy on the test data
    ### this is slightly different than the example, 
    ### where we just print the accuracy
    ### you might need to import an sklearn module
    total = len(labels_test)
    correct = (pred == labels_test).sum()
    accuracy = correct/float(total)
    from sklearn.metrics import accuracy_score
    
    accuracy = accuracy_score(labels_test,pred )
    return accuracy
Example #9
def categorize(train_data,test_data,train_class,n_features):
    #cf= ExtraTreesClassifier()
    #cf.fit(train_data,train_class)
    #print (cf.feature_importances_)
    
    #lsvmcf = sklearn.svm.LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001, C=100.0)  
    model = LogisticRegression()
    lgr = LogisticRegression(C=100.0,penalty='l1')    
    #knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=10, p=2, metric='minkowski', metric_params=None)
    svmlcf = sklearn.svm.SVC(C=1000.0, kernel='linear', degree=1, gamma=0.01,  probability=True)#2
    svmcf = sklearn.svm.SVC(C=1000.0, kernel='rbf', degree=1, gamma=0.01,  probability=True)#2
    cf = DecisionTreeClassifier() 
    dct = DecisionTreeClassifier(criterion='gini', splitter='best',  min_samples_split=7, min_samples_leaf=4)
    rf = RandomForestClassifier(n_estimators=10, criterion='gini',  min_samples_split=7, min_samples_leaf=4, max_features='auto')
    gnb = GaussianNB()  #1
    adbst = sklearn.ensemble.AdaBoostClassifier(base_estimator=rf, n_estimators=5, learning_rate=1.0, algorithm='SAMME.R', random_state=True)

    #ch2 = SelectKBest(chi2, k=n_features)
    #train_data = ch2.fit_transform(train_data, train_class)
    #test_data = ch2.transform(test_data)

    #rfe = RFE(svmlcf,n_features)
    #rfe = rfe.fit(train_data, train_class)
    gnb.fit(train_data,train_class)
    return gnb.predict(test_data)
class GaussianColorClassifier(ContourClassifier):
    '''
    A contour classifier which classifies a contour
    based on its mean color in BGR, HSV, and LAB colorspaces,
    using a Gaussian classifier for these features.

    For more usage info, see class ContourClassifier
    '''
    FEATURES = ['B', 'G', 'R', 'H', 'S', 'V', 'L', 'A', 'B']

    def __init__(self, classes, **kwargs):
        super(GaussianColorClassifier, self).__init__(classes, **kwargs)
        self.classifier = GaussianNB()

    def get_features(self, img, mask):
        mean = cv2.mean(img, mask)
        mean = np.array([[mean[:3]]], dtype=np.uint8)
        mean_hsv = cv2.cvtColor(mean, cv2.COLOR_BGR2HSV)
        mean_lab = cv2.cvtColor(mean, cv2.COLOR_BGR2LAB)
        features = np.hstack((mean.flatten(), mean_hsv.flatten(), mean_lab.flatten()))
        return features

    def classify_features(self, features):
        return self.classifier.predict(features)

    def feature_probabilities(self, features):
        return self.classifier.predict_proba(features)

    def train(self, features, classes):
        self.classifier.fit(features, classes)
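A hedged usage sketch for the class above. The ContourClassifier base is not shown here, so the constructor call is an assumption, and the colour patches below are synthetic stand-ins for real contour crops:

import cv2
import numpy as np

# Two synthetic 10x10 BGR patches plus a full mask (assumed stand-ins for real contours)
red_img = np.zeros((10, 10, 3), np.uint8);   red_img[..., 2] = 255
green_img = np.zeros((10, 10, 3), np.uint8); green_img[..., 1] = 255
mask = np.full((10, 10), 255, np.uint8)

clf = GaussianColorClassifier(['red', 'green'])   # assumes the base class only needs the class list
train_features = np.array([clf.get_features(red_img, mask),
                           clf.get_features(green_img, mask)])
clf.train(train_features, np.array(['red', 'green']))

# Recent scikit-learn releases expect a 2-D array here, hence the reshape
print(clf.classify_features(clf.get_features(red_img, mask).reshape(1, -1)))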
Example #11
def NB_experiment(data_fold, train, test, dumper):

    print "Ready to find the Best Parameters for Naive Bayes"

    print 'Gaussian Naive Bayes'
    nb = GNB()
    print "fitting NaiveBayes Experiment"

    dumper.write('Classifier: Naive Bayes\n')
    scores = cross_validation.cross_val_score(nb, train[0], train[1], 
                                              cv = data_fold, score_func=accus)

    reports = "Accuracy on Train: %0.2f (+/- %0.2f)"%(scores.mean(), scores.std()/2)
    print reports

    dumper.write(reports+'\n')
    reports = " ".join(['%0.2f'%(item) for item in scores])
    dumper.write(reports+'\n')
    
    nb = GNB()
    nb.fit(train[0], train[1])
    
    pred = clf_test(nb, test)
    output_ranking(pred, codecs.open('nb.ranking', 'w', 'utf-8'))
    return None
Example #12
def getGaussianPred(featureMatrix, labels, testSet, testSet_docIndex):
    """
    All input arguments are return of getTrainTestData()
    :param featureMatrix:
    :param labels:
    :param testSet:
    :param testSet_docIndex:
    :return docIndexPred: dict{docid: [index1, index2, ...], ...}
                        key is docid
                        value is all cognates' index
    """
    gnb = GaussianNB()
    gnb.fit(featureMatrix, labels)
    # pred = gnb.predict(featureMatrix)
    pred = gnb.predict(testSet)

    docIndexPred = dict()

    for i, p in enumerate(pred):
        if p:
            docid = testSet_docIndex[i, 0]
            index = testSet_docIndex[i, 1]
            if docid in docIndexPred:
                docIndexPred[docid].append(index)
            else:
                docIndexPred[docid] = [index]

    return docIndexPred
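The returned mapping can then be walked per document, for example (illustrative only; the inputs are still the outputs of the getTrainTestData() helper mentioned in the docstring):

docIndexPred = getGaussianPred(featureMatrix, labels, testSet, testSet_docIndex)
for docid, indices in docIndexPred.items():
    print("doc %s: %d predicted cognates at indices %s" % (docid, len(indices), indices))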
Example #13
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()

    t0 = time()
    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)
    print "training time:", round(time()-t0, 3), "s"

    ### use the trained classifier to predict labels for the test features
    import numpy as np
    t1 = time()
    pred = clf.predict(features_test)
    print "predicting time:", round(time()-t1, 3), "s"

    ### calculate and return the accuracy on the test data
    ### this is slightly different than the example,
    ### where we just print the accuracy
    ### you might need to import an sklearn module
    accuracy = clf.score(features_test, labels_test)
    return accuracy
Example #14
def gnbmodel(d,X_2,y_2,X_3,y_3,X_test,y_test):
    X_3_copy = X_3.copy(deep=True)
    X_3_copy['chance']=0
    index = 0    
    
########## k-fold cross-validation ##########
    scores = cross_val_score(GaussianNB(), X_2, y_2, cv=5, scoring='accuracy')
    score_mean = scores.mean()
    print(d + ' 5-fold CV accuracy: ' + str(score_mean))
#################################################
    
    gnb = GaussianNB().fit(X_2, y_2)

################ predict on the test set ################
    answer_gnb = gnb.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, answer_gnb)
    print(d + ' test-set accuracy: ' + str(accuracy))
###############################################
    
    chance = gnb.predict_proba(X_3)[:,1]
    for c in chance:
        X_3_copy.iloc[index,len(X_3_copy.columns)-1]=c
        index += 1
    chance_que = X_3_copy.iloc[:,len(X_3_copy.columns)-1]
    return chance_que
def performNB(trainingScores, trainingResults, testScores):
	print "->Gaussian NB"
	X = []
	for currMark in trainingScores:
		pass
	for idx in range(0, len(trainingScores[currMark])):
		X.append([])

	for currMark in trainingScores:
		if "Asym" in currMark:
			continue
		print currMark, 
		for idx in range(0, len(trainingScores[currMark])):
			X[idx].append(trainingScores[currMark][idx])

	X_test = []
	for idx in range(0, len(testScores[currMark])):
		X_test.append([])

	for currMark in trainingScores:
		if "Asym" in currMark:
			continue
		for idx in range(0, len(testScores[currMark])):
			X_test[idx].append(testScores[currMark][idx])
	gnb = GaussianNB()
	gnb.fit(X, np.array(trainingResults))
	y_pred = gnb.predict_proba(X_test)[:, 1]
	print "->Gaussian NB"
	return y_pred
Example #16
def main(argv):
    if len(argv) != 5:
        print "./NB_train_pred.py train.csv train_label test.csv save_folder label_idx"
        sys.exit(1);

    output_folder = argv[3]
    label_idx = int(argv[4])

    os.system("mkdir " + output_folder)

    print "Loading training data"
    train_array = np.load(argv[0])
    print "Loading training label"
    train_label_array = np.load(argv[1])
    print "Loading test data"
    test_array = np.load(argv[2])
    
    print "building NB on label " + str(label_idx)
    gnb = GaussianNB() 
    model = gnb.fit(train_array[:, 1:], train_label_array[1:, label_idx]) 

    print "predicting label " + str(label_idx)
    nb_pred = gnb.predict(test_array[:,1:])
    print "save the result"
    with open(output_folder + "/" + str(label_idx) + ".pred", 'w') as pred_file:
        pred_file.write("\n".join([ str(x) for x in nb_pred.tolist()]))
    with open(output_folder+"/"+str(label_idx) + ".npy", 'wb') as npy_file:
        np.save(npy_file, nb_pred)
Example #17
def NBAccuracy(features_train, labels_train, features_test, labels_test):
	#Import sklearn modules for GaussianNB
	from sklearn.naive_bayes import GaussianNB
	from sklearn.metrics import accuracy_score
	
	#Create classifier
	classifer = GaussianNB();
	
	#Timing fit algorithm
	t0 = time();
	
	#Fit classifier on the training features
	classifer.fit(features_train, labels_train);
	
	print "Training Time: ", round(time() - t0, 3), "s";
	
	GaussianNB();
	
	#Timing prediction algorithm
	t0=time();
	
	#Use trained classifier to predict labels for test features
	pred = classifer.predict(features_test);
	
	print "Prediction Time: ", round(time() - t0, 3), "s";
	
	#Calculate accuracy from features_test with answer in labels_test
	
	accuracy = accuracy_score(pred, labels_test);
	
	return accuracy;
Example #18
def NB_predict(mtx_train,label_train,mtx_test,label_test):
    G_NB = GaussianNB()
    label_train = np.ravel(label_train)

    #start = timeit.default_timer()
    clf_nb = G_NB.fit(mtx_train,label_train)
    #stop = timeit.default_timer()
    #time_interval = stop - start
    #print ("predict time is %f" %time_interval)


    #start = timeit.default_timer()
    pCVR = clf_nb.predict_proba(mtx_test)
    #stop = timeit.default_timer()
    #time_interval = stop - start
    #print ("predict time is %f" %time_interval)


    ####### Evaluation
    #fpr,tpr,thresholds = roc_curve(label_test,pCVR[:,1])
    #roc_auc = auc(fpr,tpr)
    predict_CVR = np.mean(pCVR[:,1])
    #print("LR predicted CVR is %.5f" % predict_CVR)
    auc_score = roc_auc_score(label_test,pCVR[:,1])
    #print("ROC AUC score for LR is %.4f" % auc_score)
    lg_rmse = sqrt(mean_squared_error(label_test, pCVR[:,1]))
    #print("rmse is %.4f" % lg_rmse)

    return pCVR, predict_CVR, auc_score, lg_rmse
Example #19
def main(unused_argv):


    x,y=load_data()

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

    vp = learn.preprocessing.VocabularyProcessor(max_document_length=MAX_DOCUMENT_LENGTH, min_frequency=1)

    x_train = np.array(list(vp.fit_transform(x_train)))
    x_test = np.array(list(vp.transform(x_test)))
    n_words=len(vp.vocabulary_)
    print('Total words: %d' % n_words)

    gnb = GaussianNB()
    y_predict = gnb.fit(x_train, y_train).predict(x_test)
    score = metrics.accuracy_score(y_test, y_predict)
    print('NB Accuracy: {0:f}'.format(score))

    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(x_train)
    classifier = tf.contrib.learn.DNNClassifier(
        feature_columns=feature_columns, hidden_units=[500,10], n_classes=2)

    classifier.fit(x_train, y_train, steps=5000, batch_size=10)
    y_predict=list(classifier.predict(x_test, as_iterable=True))
    score = metrics.accuracy_score(y_test, y_predict)
    print('DNN Accuracy: {0:f}'.format(score))
def test_gnb_priors():
    """Test whether the class prior override is properly used"""
    clf = GaussianNB(priors=np.array([0.3, 0.7])).fit(X, y)
    assert_array_almost_equal(clf.predict_proba([[-0.1, -0.1]]),
                              np.array([[0.825303662161683,
                                         0.174696337838317]]), 8)
    assert_array_equal(clf.class_prior_, np.array([0.3, 0.7]))
def classifyNB():
    print 'Classify..'
    target_names = ['unacc', 'acc','good','v-good']
    df = pd.read_csv("data/cars-cleaned.txt", delimiter=",");    
    print df
    print df.dtypes
    df_y = df['accept']
    df_x = df.ix[:,:-1]

    #print df_y
    #print df_x
    train_y, test_y, train_x, test_x = train_test_split(df_y, df_x, test_size = 0.3, random_state=33)
    
    clf = GaussianNB()
    tstart=time.time()
    model = clf.fit(train_x, train_y)
    print "training time:", round(time.time()-tstart, 3), "seconds"
    y_predictions = model.predict(test_x)
    print "Accuracy : " , model.score(test_x, test_y)
    #print y_predictions
    c_matrix = confusion_matrix(test_y,y_predictions)
    print "confusion matrix:"
    print c_matrix
    
    plt.matshow(c_matrix)
    plt.colorbar();
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)    
    plt.ylabel('true label')
    plt.xlabel('predicted label')
    plt.show()
class GaussianNBClassifier:

	def __init__(self):
		"""
		This is the constructor responsible for initializing the classifier
		"""
		self.outputHeader = "#gnb"
		self.clf = None

	def buildModel(self):
		"""
		This builds the model of the Gaussian NB classifier
		"""
		self.clf =  GaussianNB()

	def trainGaussianNB(self,X, Y):
		"""
		Training the Gaussian NB Classifier
		"""
		self.clf.fit(X, Y)

	def validateGaussianNB(self,X, Y):
		"""
		Validate the Gaussian NB Classifier
		"""
		YPred = self.clf.predict(X)
		print accuracy_score(Y, YPred)

	def testGaussianNB(self,X, Y):
		"""
		Test the Gaussian NB Classifier
		"""
		YPred = self.clf.predict(X)
		print accuracy_score(Y, YPred)
def classify(features_train, labels_train):
    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    ### import the sklearn module for GaussianNB
    ### create classifier
    ### fit the classifier on the training features and labels
    return clf
def test_gnb_prior():
    # Test whether class priors are properly set.
    clf = GaussianNB().fit(X, y)
    assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8)
    clf.fit(X1, y1)
    # Check that the class priors sum to 1
    assert_array_almost_equal(clf.class_prior_.sum(), 1)
Example #25
def nb_names():
	#generate list of tuple names
	engine = create_engine('sqlite:///names.db')
	DBSession = sessionmaker(bind=engine)
	session = DBSession()
	db_names = names.Names.getAllNames(session)
	names_list = [(x,'name') for x in db_names]
	words_list = generate_words()
	sample_names = [names_list[i] for i in sorted(random.sample(xrange(len(names_list)), len(words_list)))]

	data = sample_names + words_list
	shuffled_data = np.random.permutation(data)
	strings = []
	classification = []
	for item in shuffled_data:
		strings.append([item[0]])
		classification.append(str(item[1]))


	X = np.array(strings)
	Y = np.array(classification)

	print X,Y
	clf = GaussianNB()
	clf.fit(X, Y)
Example #26
def trainNB():
    

    featureVector = []
    classVector = []
    temp= []
    headerLine = True


    #training
    train = open(r'C:\Python34\alchemyapi_python\TrainingDataDummy.csv')

    for line in train:
        if(headerLine):
            headerLine = False
        else:
            temp = line.split(",")
            x = [float(temp[i]) for i in activeFeatureIndex]
            #print(x)
            featureVector.append(x)
            #temp = [int(x) for x in line.split(",")[-1].rstrip("\n")]
            classVector.append(int(line.split(",")[-1].rstrip("\n")))

        
    fVector = np.array(featureVector)
    cVector = np.array(classVector)
    #print(classVector)
    print(fVector.shape)
    print(cVector.shape)

    clf = GaussianNB()
    clf.fit(fVector,cVector)
    train.close()

    return clf
Example #27
class CruiseAlgorithm(object):
	# The cruise algorithm classifies the cruise phase vs. the non-cruise phase; it uses the differential change in the data stream as the input matrix
	def __init__(self, testing=False):
		self.core = GaussianNB()
		self.scaler = RobustScaler()
		self.X_prev = None
		self.testing = testing
	def fit(self,X,Y): # Y should be the label of cruise or not
		X = self.prepare(X)
		self.core.fit(X,Y.ravel())
	def predict(self, X):
		if self.testing:
			X_t = self.prepare(X)
		else:
			if self.X_prev:
				X_t = X - self.X_prev
			else:
				X_t = X
			self.X_prev = X

		print repr(X_t)
		prediction_result = self.core.predict(X_t)
		return np.asmatrix(prediction_result)

	def prepare(self,X):
		a = np.zeros((X.shape[0],X.shape[1]))
		for i in xrange(X.shape[0]-1):
			a[i+1,:] = X[i+1] - X[i]
		return a
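A hedged usage sketch for CruiseAlgorithm; the telemetry below is synthetic and the cruise labels are invented:

import numpy as np

# 20 samples x 3 telemetry channels: slow drift (cruise, label 1) then fast drift (label 0)
X_train = np.vstack([np.cumsum(np.random.rand(10, 3) * 0.01, axis=0),
                     np.cumsum(np.random.rand(10, 3), axis=0)])
Y_train = np.array([1] * 10 + [0] * 10)

algo = CruiseAlgorithm(testing=True)   # testing=True makes predict() run prepare() as well
algo.fit(X_train, Y_train)
print(algo.predict(X_train))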
Example #28
def naive_bayes(features, labels):
    classifier = GaussianNB()
    classifier.fit(features, labels)
    scores = cross_validation.cross_val_score(
        classifier, features, labels, cv=10, score_func=metrics.precision_recall_fscore_support
    )
    print_table("Naive Bayes", numpy.around(numpy.mean(scores, axis=0), 2))
def univariateFeatureSelection(f_list, my_dataset):
	result = []
	for feature in f_list:
		# Replace 'NaN' with 0
		for name in my_dataset:
			data_point = my_dataset[name]
			if not data_point[feature]:
				data_point[feature] = 0
			elif data_point[feature] == 'NaN':
				data_point[feature] =0

		data = featureFormat(my_dataset, ['poi',feature], sort_keys = True, remove_all_zeroes = False)
		labels, features = targetFeatureSplit(data)
		features = [abs(x) for x in features]
		from sklearn.cross_validation import StratifiedShuffleSplit
		cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
		features_train = []
		features_test  = []
		labels_train   = []
		labels_test    = []
		for train_idx, test_idx in cv:
			for ii in train_idx:
				features_train.append( features[ii] )
				labels_train.append( labels[ii] )
			for jj in test_idx:
				features_test.append( features[jj] )
				labels_test.append( labels[jj] )
		from sklearn.naive_bayes import GaussianNB
		clf = GaussianNB()
		clf.fit(features_train, labels_train)
		predictions = clf.predict(features_test)
		score = score_func(labels_test,predictions)
		result.append((feature,score[0],score[1],score[2]))
	result = sorted(result, reverse=True, key=lambda x: x[3])
	return result
Example #30
File: nb.py Project: mkdmkk/infaas
class PatternBasedDiagnosis:
    """
    Pattern-Based Diagnosis with Gaussian Naive Bayes
    """

    __slots__ = [
        "model"
    ]

    def __init__(self):
        pass

    def train(self, data, labels):
        """
        Train the Gaussian naive Bayes model with the training data
        :param data:
        :param labels:
        :return:
        """
        print('Training Data: %s' % (data))
        print('Training Labels: %s' % (labels))
        self.model = GaussianNB()
        self.model = self.model.fit(data, labels)

    def eval(self, obs):
        # print('Testing Result: %s; %s' % (self.model.predict(obs), self.model.predict_proba(obs)))
        print('Testing Result: %s' % self.model.predict(obs))
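A small usage sketch for the class above; the observations and fault labels are hypothetical:

import numpy as np

diag = PatternBasedDiagnosis()
data = np.array([[0.1, 0.9], [0.2, 0.8], [0.9, 0.1], [0.8, 0.2]])   # toy metric patterns
labels = ['disk', 'disk', 'network', 'network']                     # made-up fault classes
diag.train(data, labels)
diag.eval(np.array([[0.15, 0.85]]))   # prints the predicted label for one new observation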
Example #31
                (usernum))  # dist is the de-duplicated command sequence
            #  print  ("De-duplicated vector table Dist for this user: (%s)" % dist)
            user_cmd_feature = get_user_cmd_feature_all(
                user_cmd_list, dist)  # 150 vectors, each with len(dist) components encoded as 1 or 0

            labels = get_label("D:/ml/用户异常行为检测/MasqueradeDat/label.txt",
                               usernum - 1)
            y = [0] * 50 + labels  # prepend labels for the first 50 normal sequences

            x_train = user_cmd_feature[0:N]  # first N (100) samples as the training set (sequence vectors / feature set)
            y_train = y[0:N]  # the corresponding training labels

            x_test = user_cmd_feature[N:150]  # test-set features
            y_test = y[N:150]  # test-set labels

            clf = GaussianNB().fit(x_train, y_train)
            y_predict = clf.predict(x_test)
            score = np.mean(y_test == y_predict) * 100

            print('User%s actual labels for the last 50 command sequences (0 = normal):' % (usernum), y_test)

            print('   NB predicted labels for the last 50 command sequences (0 = normal):', y_predict.tolist())
            print('NB anomaly-detection accuracy:', score)
            target_name = ['normal', 'abnormal']
            print(
                classification_report(y_test,
                                      y_predict,
                                      target_names=target_name))
            print(
                model_selection.cross_val_score(clf,
                                                user_cmd_feature,
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    svm.SVC(probability=True),
    DecisionTreeClassifier(),
    XGBClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]

log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

# In[ ]:

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

SSplit = StratifiedShuffleSplit(test_size=0.3, random_state=7)
acc_dict = {}
#KNN classifier 
from sklearn.neighbors import KNeighborsClassifier

KNN_model = KNeighborsClassifier(n_neighbors=5)

# Train the model using the training sets
KNN_model.fit(train_F_scaled,train_response)


# In[79]:


#naive bayes

from sklearn.naive_bayes import GaussianNB
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(train_predictor, train_response)


# In[80]:


y_pred = naive_bayes_model.predict(train_predictor)

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          train_predictor.shape[0],
          (train_response != y_pred).sum(),
          100*(1-(train_response != y_pred).sum()/train_predictor.shape[0])))
Example #34
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

#Gaussian naive Bayes: data from each label is drawn from simple Gaussian distribution

from sklearn.datasets import make_blobs
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu');

#find mean and standard deviation of points within a label, which defines the distribution
#can then compute posterior ratio for given point

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y);

rng = np.random.RandomState(0)
Xnew = [-6, -14] + [14, 18] * rng.rand(2000, 2)
ynew = model.predict(Xnew)

plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
lim = plt.axis()
plt.scatter(Xnew[:, 0], Xnew[:, 1], c=ynew, s=20, cmap='RdBu', alpha=0.1)
plt.axis(lim);

#in general, boundary in Gaussian naive Bayes is quadratic
#allows for probabilistic classification

yprob = model.predict_proba(Xnew)
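To make the "mean and standard deviation per label" point concrete, the per-class statistics can be recomputed by hand and compared with what the fitted model stores (the variance attribute is sigma_ in older scikit-learn releases and var_ in newer ones, and it includes a small smoothing term):

for label in np.unique(y):
    members = X[y == label]
    print(label, members.mean(axis=0), members.var(axis=0))

print(model.theta_)                                              # per-class feature means
print(getattr(model, 'var_', getattr(model, 'sigma_', None)))    # per-class feature variances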
Example #35
def ModelParam_GridSearch(X_train, y_train, cv=4,scoreParam = 'f1'):
    '''
    Basic grid searchCV for multiple classifiers' perf & parameters.
    This is very limited and computationally expensive.
    Not guaranteed to reach even a local optimum, but good to get a
    rough idea of parameters for the classifiers. (Does not address pre-processing)
    More classifiers can be added as desired, and parameters expanded.

    Later: Add options for RBM + Logit; PCA; ICA; LDA.
     See also
    http://scikit-learn-laboratory.readthedocs.org/en/latest/_modules/skll/learner.html

    TODO: Add parameters + put classifiers/"pipeline_#" in a list. (To allow checking only some params)
    '''

#    pipeline1 = Pipeline('clf', RandomForestClassifier() )
#
#    pipeline2 = Pipeline(
#    ('clf', KNeighborsClassifier()),)
    pipeline1 = RandomForestClassifier(n_jobs=-1)
    pipeline2 = KNeighborsClassifier()

    pipeline3 = SVC(cache_size=1500)
    # pipeline3 = NuSVC(cache_size=1500)

    pipeline4 = GaussianNB()
    pipeline5 = GradientBoostingClassifier()
    pipeline6 = SGDClassifier()
    pipeline7 = LogisticRegression()


    'RandomForestClassifier:'
    parameters1 = {
    'n_estimators': [150],
    'criterion': ['gini'],
    'max_features': ['auto',0.4],
    'max_depth': [8,None],
    'min_samples_leaf':[1,2],
    'min_samples_split':[2,4],
    'n_jobs':[-1]
    }
    #, 'entropy'
        # 'n_jobs':[-1]

    'KNeighborsClassifier:'
    parameters2 = {
    'n_neighbors': [7],
    'weights': ['distance']
    }

    'SVC:'
    parameters3 = {
    'C': [0.01,0.1, 1,10,100],
    'kernel': ['linear','rbf'],
    'gamma': [0.1,0.0, 1.0,20],
    'cache_size':[1500],
    'class_weight':['auto'],
    }
# , 'poly','sigmoid']

##    'GaussianNB:'
##    parameters4 = {}

    'GradientBoostingClassifier'
    parameters5 = {
    'max_depth':[3,5,8],
    'n_estimators': [100],
    'min_samples_leaf':[1,2],
    'learning_rate': [0.1, 0.01],
    'max_features': ['auto',0.4]
    }
    'SGDClassifier:'
    parameters6 = {
     'alpha': [0.00001,0.001,0.01],
    'penalty': ['l1','l2', 'elasticnet'],
    'n_iter': [300],
    'loss':['hinge'],
    'n_jobs':[-1],
    'class_weight':['auto']
    }
#, 'modified_huber','log'

    'LogisticRegression:'
    parameters7 = {
    'C': [0.001,0.01, 0.1, 1.0,10,100],
    'penalty': ['l1','l2'],
    'class_weight':['auto']
    }

    'TODO: make this into a separate method, with pars, pips passed to it as params'
    pars = [parameters1, parameters2, parameters3,parameters5,parameters6,parameters7] #parameters4
    pips = [pipeline1, pipeline2, pipeline3,pipeline5,pipeline6,pipeline7] # pipeline4,

    print ("Starting Gridsearch To find each model's best parameters")
    for i in range(len(pars)):
        print(pips[i])

        gs = GridSearchCV(estimator=pips[i], param_grid=pars[i],
                          verbose=0, refit=True, n_jobs=-1,iid=False,
                          pre_dispatch='2*n_jobs',scoring=scoreParam,
                          fit_params={'sample_weight': balance_weights(y)},
                          cv=StratifiedKFold(y_train,n_folds=cv,shuffle=True))
#Valid scoring options: ['accuracy', 'average_precision', 'f1', 'precision', 'recall', 'roc_auc']
        # gs = gs.fit(X_train, y_train)
        'http://stackoverflow.com/questions/13051706/scikit-learn-using-sample-weight-in-grid-search?rq=1'
        'Note: Remove "class_weight=auto"  from the autoweighting classifiers!!'
        "Set Class weights (then into sample weights: https://github.com/scikit-learn/scikit-learn/blob/8dab222cfe894126dfb67832da2f4e871b87bce7/sklearn/utils/class_weight.py"
        gs.fit(X_train, y_train)
        #print ("Finished Gridsearch")
        #print (gs.best_score_)
        report(gs.grid_scores_)
        # http://stackoverflow.com/questions/18210799/scikit-learn-sample-try-out-with-my-classifier-and-data

        'Get more exhaustive CV results with the best tuned parameters for the model'
        est = gs.best_estimator_
        scores = cross_val_score(est, X_train,
                                 y_train,
                                 cv=StratifiedShuffleSplit(y=y_train, n_iter=10, test_size=0.2),scoring=scoreParam,
                                 n_jobs=-1, pre_dispatch='1.8*n_jobs')
        print("Tuned Model's %s Score: %0.3f (+/- %0.3f)" % (scoreParam,scores.mean(), scores.std() * 2))
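GaussianNB (pipeline4/parameters4) is commented out above because it has essentially nothing to tune; if it were added back, about the only knob is var_smoothing. A minimal sketch, assuming a modern scikit-learn where GridSearchCV lives in sklearn.model_selection and GaussianNB exposes var_smoothing:

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

parameters4 = {'var_smoothing': np.logspace(-12, -6, 7)}
gs_nb = GridSearchCV(GaussianNB(), parameters4, scoring='f1', cv=4)   # mirrors the function's defaults
# gs_nb.fit(X_train, y_train); print(gs_nb.best_params_, gs_nb.best_score_)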
Example #36
# scatter_matrix(dataset)
# plt.show()

array = dataset.values
X = array[:, 0:4]
Y = array[:, 4]
validation_size = 0.20
seed = 7
scoring = 'accuracy'
X_train, X_validation, Y_train, Y_validation = \
    model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Spot Checking
models = [('LR', LogisticRegression()), ('LDA', LinearDiscriminantAnalysis()),
          ('KNN', KNeighborsClassifier()), ('CART', DecisionTreeClassifier()),
          ('NB', GaussianNB()), ('SVM', SVC())]

results = []
names = []

# Shows KNN as the most accurate model
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model,
                                                 X_train,
                                                 Y_train,
                                                 cv=kfold,
                                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "{}: {} ({})".format(name, round(cv_results.mean(), 3),
Example #37
train.fillna(train.mean(),inplace=True)

test=test.drop(['Name','Ticket','Cabin','PassengerId'],axis='columns')

test['Sex']=test['Sex'].replace({'male':0,'female':1})
test['Embarked']=test['Embarked'].replace({'S':1,'Q':2,'C':3})

test.fillna(test.mean(),inplace=True)

models=[]
models.append(('CART',DecisionTreeClassifier()))
models.append(('RF',RandomForestClassifier()))
models.append(('LR',LogisticRegression()))
models.append(('PPN',Perceptron()))
models.append(('NB',GaussianNB()))
models.append(('SVM',SVC()))

results=[]
names=[]

for name,model in models:
	scores=cross_val_score(model,train,target,cv=10,scoring='accuracy')
	results.append(scores.mean())
	names.append(name)

print names
print results

#fig=plt.figure()
#fig.suptitle('Algorithm Comparison')
Example #38
X = cv.fit_transform(corpus).toarray()

# Step 10: Creating dependent Variable
y = dataset.iloc[:, 1].values

# Step 11: ***//Classification//***
# Based on experiance NLP best fits for //Naive Bayes, Decision Tree, Random Forest//
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
# Choose more Datasets for training and less to test,bcz of 1500 datasets

# NO Need of Feature Scaling becz, most of them are Zero and 1

#Lets use Naive bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

#predicting Test Results
y_pred = classifier.predict(X_test)

# confussion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

#calculating Accuracy
# accuracy by hand from the confusion matrix: (55 + 91) / 200 = 0.73
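The same figure can be read straight off the confusion matrix instead of hard-coding the counts (55 and 91 appear to be the diagonal entries of cm for this particular run):

accuracy = (cm[0, 0] + cm[1, 1]) / float(cm.sum())
print(accuracy)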
Example #39
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))

#Baseline - Nearest centroid
tic = timeit.default_timer()
nc = NearestCentroid()
nc.fit(x_train, y_train)
print('Train-acc:', nc.score(x_train, y_train))
print('Test-acc:', nc.score(x_test, y_test))
toc = timeit.default_timer()
elapsed_time = toc - tic
NC_time = elapsed_time
print('Elapsed time: ', elapsed_time, 'seconds')

#Gaussian Naive Bayes - assumes a Gaussian distribution for each class
tic = timeit.default_timer()
model = GaussianNB()
model.fit(x_train, y_train)
toc = timeit.default_timer()
print('Train-acc:', model.score(x_train, y_train))
print('Test-acc:', model.score(x_test, y_test))
toc = timeit.default_timer()
elapsed_time = toc - tic
GNB_time = elapsed_time
print('Elapsed time: ', elapsed_time, 'seconds')

#Confidence interval - to verify
predicted_test = model.predict(x_test)
test_acc = accuracy_score(y_test, predicted_test)
n_success = np.sum(y_test == predicted_test)

p = 0.91
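The excerpt is cut off here, but the "confidence interval - to verify" comment above presumably refers to an interval around the test accuracy. A minimal sketch using the normal approximation to the binomial (the role of p = 0.91 is not visible in the excerpt, so it is left untouched):

import numpy as np
from scipy import stats

n = len(y_test)
z = stats.norm.ppf(0.975)                               # 95% two-sided
half_width = z * np.sqrt(test_acc * (1 - test_acc) / n)
print('accuracy %.3f +/- %.3f' % (test_acc, half_width))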
        stemmed.append(stemmer.stem(item))
    return stemmed
def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

#obtain stop words
stop_words = text.ENGLISH_STOP_WORDS

#define pipeline for tokenizing, feature extraction, feature selection, and naïve Bayes algorithm
text_clf = Pipeline([('vect', CountVectorizer(tokenizer=tokenize, stop_words=stop_words,analyzer='word')),
                     ('tfidf', TfidfTransformer()),
                      ('dimensionality_reduction',TruncatedSVD(n_components=50, random_state=42)),
                     ('clf', GaussianNB()),
])

text_clf = text_clf.fit(x_train, y_train)

#test data validation
predicted = text_clf.predict(x_test)
print np.mean(predicted == y_test)

#print the statistic summary and confusion matrix
names = ['comp.sys.ibm.pc.hardware' , 'comp.sys.mac.hardware', 'misc.forsale', 'soc.religion.christian']
print(metrics.classification_report(y_test, predicted,
    target_names = names))

print metrics.confusion_matrix(y_test, predicted)
Example #42
#3. data preprocessing
# select features and Normalization
x = Data.loc[:, [
    'ClumpTkns', 'UnofCSize', 'UnofCShape', 'MargAdh', 'SngEpiCSize',
    'BareNuc', 'BlandCrmtn', 'NrmlNuc', 'Mitoses'
]]
y = Data['Malignant']
# TRANSFORM GIVES 1% LESS ACCURATE RESULT!!!!
min_max_scaler = preprocessing.MaxAbsScaler()
x = min_max_scaler.fit_transform(x)

# 4. train the model and evaluate it with cross-validation
# using GaussianNB
print("\nSelected Algorithm: GaussianNB")
clf = GaussianNB()

scores = cross_val_score(clf, x, y, cv=5)
predictions = cross_val_predict(clf, x, y, cv=5)
accuracy = metrics.r2_score(y, predictions)
#print("\nCross-validation scores: {}".format(scores))
print("\nmean training result = {}".format(np.mean(scores)))
print("\nCross-predicted accuracy: {}\n".format(accuracy))
"""
#submission
print("Writing submission.csv file...")
index = [i for i in range(Data.shape[0])]
df2 = pd.DataFrame({'Predictions': predictions}, index=index)
submission = pd.concat([Data, df2], axis=1)
submission.to_csv('wresult.csv', index=False)
"""
Example #43
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    label = np.ravel(label)  # flatten label to 1-D
    return feature, label


if __name__ == '__main__':
    feature_paths = [
        r'A.feature', r'B.feature', r'C.feature', r'D.feature', r'E.featurE'
    ]
    label_paths = [r'A.label', r'B.label', r'C.label', r'D.label', r'E.label']
    x_train, y_train = load_dataset([feature_paths[0]], [label_paths[0]])
    x_test, y_test = load_dataset([feature_paths[1]], [label_paths[1]])
    x_train, x_, y_train, y_ = train_test_split(
        x_train, y_train, test_size=0.0)  # with test_size=0, x_ and y_ stay empty; the call only shuffles the data
    print('start training knn')
    knn = KNeighborsClassifier().fit(x_train, y_train)
    a_knn = knn.predict(x_test)
    print('start training dt')
    dt = DecisionTreeClassifier().fit(x_train, y_train)
    a_dt = dt.predict(x_test)
    print('start training gnb')
    gnb = GaussianNB().fit(x_train, y_train)
    a_gnb = gnb.predict(x_test)
    print('test knn')
    print(classification_report(y_test, a_knn))
    print('test dt')
    print(classification_report(y_test, a_dt))
    print('test gnb')
    print(classification_report(y_test, a_gnb))
Example #44
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from matplotlib.font_manager import *

myfont = FontProperties(fname='C:\Windows\Fonts\simfang.ttf') 
RANDOM_STATE = 42
FIG_SIZE = (10, 7)
features, target = load_wine(return_X_y=True)
# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                     test_size=0.30,
                                                     random_state=RANDOM_STATE)
# Fit to data and predict using pipelined GNB and PCA.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)
# Fit to data and predict using pipelined scaling, GNB and PCA.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)
# Show prediction accuracies in scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))
print('\nPrediction accuracy for the standardized test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))
# Extract PCA from pipeline
pca = unscaled_clf.named_steps['pca']
pca_std = std_clf.named_steps['pca']
# Show first principal components
def evaluate(model, data, alg = None, classifier="lr",fast=False,ratio = None,cv=10,normalize=False,random_state = None,return_y = False):
    X = model
    Y = data
    micros = []
    macros = []
#    for y,key in enumerate(data.labels.keys()):
#        for index,paper in enumerate(data.labels[key]):
#            if paper not in model.paper2id:
#                print("paper not in model: ", paper)
#                continue
#            X.append(model.paper_embeddings[model.paper2id[paper]])
#            Y.append(y)
    print("len X: ", len(X))
    print("len Y: ", len(Y))
    if normalize:
        X = sk_normalize(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    clf = LogisticRegression()
    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] 
    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break
            
        micros = []
        macros = []
        for i in range(cv):
            clf = LogisticRegression()
            if classifier.lower() == "svm":
                clf = SVC(cache_size=5000)
            elif classifier.lower() == "mlp":
                clf = MLPClassifier()
            elif classifier.lower() == "nb":
                clf = GaussianNB()

            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1-r,random_state=random_state)
            clf.fit(X_train,Y_train)
            prediction = clf.predict(X_test)
            #lpred = clf.predict_proba(X_test)
            #print("prediction shape: ", prediction[0])
            #print("y_test shape: ", Y_test[0])
            #print("Loss: ", log_loss(Y_test,lpred))
            micro = f1_score(Y_test, prediction, average='micro')
            macro = f1_score(Y_test, prediction, average='macro')
            micros.append(micro)
            macros.append(macro)

        micros = np.mean(micros)
        macros = np.mean(macros)

 
        df["ratio"].append(r)
        df["micro"].append(micros)
        df["macro"].append(macros)
        #df["alg"].append(alg)
        #df["data"].append(str(data))
        #df["total_samples"] = model.total_samples
        #df["negative"].append(model.negative)
        #df["walk_window"].append(model.walk_window)
        #df["walk_probability"].append(model.walk_probability)
        #df["L2"].append(model.l2)
        logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" % (r,micros,macros))

    if fast:
        if return_y:
            return micros,macros,Y_test,prediction
        return micros,macros
    else:
        return pd.DataFrame(df)
Example #46
def save_data():
    with open('data.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')
        while len(data) != 0:
            i = random.randint(0, len(data) - 1)
            data[i].insert(0, labels[i])
            writer.writerow(data[i])
            del data[i]
            del labels[i]


data_beg_len = len(data)
if data_beg_len != 0:
    clf1 = SVC()
    clf1.fit(data, labels)
    clf2 = GaussianNB()
    clf2.fit(data, labels)

number = 20

new_input = []


def choose_ans(a):
    ans = [0 for i in range(10)]
    for n in a:
        ans[n] += 1
    num = 0
    for i in range(10):
        if ans[i] > ans[num]:
            num = i
print('Accuracy of LDA classifier on training set: {:.2f}'
     .format(lda.score(scaled_X_train, Y_train)))
print('Accuracy of LDA classifier on test set: {:.2f}'
     .format(lda.score(scaled_X_test, Y_test)))

pred_lda = lda.predict(scaled_X_test)
print(confusion_matrix(Y_test, pred_lda))
print(classification_report(Y_test, pred_lda))


# In[47]:


#fit a naive bayes model
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(scaled_X_train, Y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(scaled_X_train, Y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(scaled_X_test, Y_test)))

pred_gnb = gnb.predict(scaled_X_test)
print(confusion_matrix(Y_test, pred_gnb))
print(classification_report(Y_test, pred_gnb))


# In[48]:


#fit a svm classifier
def evaluate_multilabel(model, data, alg = None, classifier="lr",fast=False,ratio = None, cv = 10, random_state = None,normalize=False):
    X = []
    Y = []
    for pid in range(len(model.word2id)):
        X.append(model.word_embeddings[pid])
        
    Y = np.zeros((len(X),len(data.labels)))
    
    for y,key in enumerate(data.labels.keys()):
        for index,paper in enumerate(data.labels[key]):
            pid = model.word2id[paper]
            Y[pid][y] = 1
    if normalize:
        X = sk_normalize(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] 
        
    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break

        if classifier.lower() == 'lr':
            clf = LogisticRegression()
        elif classifier.lower() == "svm":
            clf = SVC(cache_size=5000)
        elif classifier.lower() == "mlp":
            clf = MLPClassifier()
        elif classifier.lower() == "nb":
            clf = GaussianNB()
            
        micros = []
        macros = []
        for i in range(cv):
            micro,macro = evaluateNodeClassification(X,Y,1-r,clf=clf,random_state = random_state)
            micros.append(micro)
            macros.append(macro)
        micros = np.mean(micros)
        macros = np.mean(macros)
     
        df["ratio"].append(r)
        df["micro"].append(micros)
        df["macro"].append(macros)
        #df["alg"].append(alg)
        #df["data"].append(str(data))
        #df["total_samples"].append(model.total_samples)
        #df["negative"].append(model.negative)
        #df["walk_window"].append(model.walk_window)
        #df["walk_probability"].append(model.walk_probability)   
        #df["L2"].append(model.l2)   
       
        logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" % (r,micros,macros))
        
        
    if fast:
        return micros,macros
    else:
        return df
        DB[i] = 2
    elif DB[i] == 66:
        DB[i] = 2
    elif DB[i] == 70:
        DB[i] = 3
    elif DB[i] == 74:
        DB[i] = 3
    elif DB[i] == 78:
        DB[i] = 3
    elif DB[i] == 82:
        DB[i] = 4
    elif DB[i] == 86:
        DB[i] = 4

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

#from sklearn.naive_bayes import MultinomialNB
#clf = MultinomialNB()

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
X = sum
y = DB
kf = KFold(n_splits=20)
acc = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    y_test = y_test.ravel()
Example #50
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()




#########################################################
### your code goes here ###

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"

t0 = time()
pred = clf.predict(features_test)
print "testing time:", round(time()-t0, 3), "s"

accuracy = clf.score(features_test, labels_test)
print "Accuracy: "
print accuracy
#########################################################

Example #51
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

################## load data #####################
iris = datasets.load_iris()
x, y = iris.data[:, 1:3], iris.target

################## define classifier #####################
clf1 = KNeighborsClassifier(n_neighbors=1)

clf2 = RandomForestClassifier(random_state=1)

clf3 = GaussianNB()

lr = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

################## class result #####################
for clf, label in zip(
    [clf1, clf2, clf3, sclf],
    ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']):

    scores = model_selection.cross_val_score(clf,
                                             x,
                                             y,
                                             cv=3,
                                             scoring='accuracy')
Example #52
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from varname import nameof

sv = SVC()
RFC = RandomForestClassifier()
GaussianN = GaussianNB()
KNC = KNeighborsClassifier(n_neighbors=7)
xgboost = XGBClassifier()
gradientboost = GradientBoostingClassifier()

df = pd.read_csv(r'dataframes/full_csv', index_col=[0])

# with open(r'objects/wektor_lst', 'rb') as f:
#     res_wek = np.load(f)
res_wek = np.load(r'objects/wektors.npy', allow_pickle=True)
res_wek = [wek[0:20] for wek in res_wek]
zzz = np.stack(res_wek)
res_wek = zzz.reshape([7023, 2000])

scoring = ['precision', 'recall', 'f1', 'accuracy']

sv_score_array = cross_validate(sv,
# Import libraries
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
# define data, create model and fit data
X = Variables
Y = Classes
Model = GaussianNB().fit(X, Y)  # GaussianNB has no required constructor arguments
# Score model
Model.score(X, Y)
# Predict new classes
NewY = Model.predict(NewX)
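For a concrete, runnable version of the cheat-sheet above, the iris dataset can stand in for Variables/Classes:

from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB

X, Y = load_iris(return_X_y=True)
Model = GaussianNB().fit(X, Y)
print(Model.score(X, Y))       # training accuracy
print(Model.predict(X[:5]))    # predicted classes for the first five rows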
Example #54
def self_projection(
    X,
    cell_types,
    classifier="LR",
    penalty="l1",
    sparsity=0.5,
    fraction=0.5,
    solver="liblinear",
    n=0,
    cv=5,
    whole=False,
    n_jobs=None,
):
    # n = 100 should be good.
    """
    This is the core function for running self-projection.

    Input
    -----
    X: `numpy.array` or sparse matrix
        the expression matrix, e.g. ad.raw.X.
    cell_types: `list of String/int`
        the cell clustering assignment
    classifier: `String` optional (default: 'LR')
        a machine learning model in "LR" (logistic regression), \
        "RF" (Random Forest), "GNB" (Gaussian Naive Bayes), "SVM" (Support Vector Machine) and "DT" (Decision Tree).
    penalty: `String` optional (default: 'l1')
        the regularization norm used by logistic regression. Use 'l1' or 'l2'.
    sparsity: `float` optional (default: 0.5)
        The sparsity parameter (C in sklearn.linear_model.LogisticRegression) for the logistic regression model.
    fraction: `float` optional (default: 0.5)
        Fraction of data included in the training set. 0.5 means use half of the data for training,
        if half of the data is fewer than maximum number of cells (n).
    n: `int` optional (default: 100)
        Maximum number of cell included in the training set for each cluster of cells.
        only fraction is used to split the dataset if n is 0.
    cv: `int` optional (default: 5)
        fold for cross-validation on the training set.
        0 means no cross-validation.
    whole: `bool` optional (default: False)
        if measure the performance on the whole dataset (include training and test).
    n_jobs: `int` optional, number of threads to use with the different classifiers (default: None - unlimited).

    return
    -----
    y_prob, y_pred, y_test, clf
    y_prob: `matrix of float`
        prediction probability
    y_pred: `list of string/int`
        predicted clustering of the test set
    y_test: `list of string/int`
        real clustering of the test set
    clf: the classifier model.
    """
    # split the data into training and testing
    if n > 0:
        X_train, X_test, y_train, y_test = train_test_split_per_type(
            X, cell_types, n=n, frac=(1 - fraction))
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, cell_types, stratify=cell_types,
            test_size=fraction)  # fraction means test size
    # set the classifier
    if classifier == "LR":
        clf = LogisticRegression(
            random_state=1,
            penalty=penalty,
            C=sparsity,
            multi_class="ovr",
            solver=solver,
        )
    elif classifier == "RF":
        clf = RandomForestClassifier(random_state=1, n_jobs=n_jobs)
    elif classifier == "GNB":
        clf = GaussianNB()
    elif classifier == "GPC":
        clf = GaussianProcessClassifier(n_jobs=n_jobs)
    elif classifier == "SVM":
        clf = SVC(probability=True)
    elif classifier == "SH":
        clf = SGDClassifier(loss="squared_hinge", n_jobs=n_jobs)
    elif classifier == "PCP":
        clf = SGDClassifier(loss="perceptron", n_jobs=n_jobs)
    elif classifier == "DT":
        clf = DecisionTreeClassifier()

    # mean cross validation score
    cvsm = 0
    if cv > 0:
        cvs = cross_val_score(clf,
                              X_train,
                              np.array(y_train),
                              cv=cv,
                              scoring="accuracy",
                              n_jobs=n_jobs)
        cvsm = cvs.mean()
        print("Mean CV accuracy: %.4f" % cvsm)
    # accuracy on the training set and on the hold-out test set
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_train, y_train)
    print("Accuracy on the training set: %.4f" % accuracy)
    accuracy_test = clf.score(X_test, y_test)
    print("Accuracy on the hold-out set: %.4f" % accuracy_test)

    # accuracy of the whole dataset
    if whole:
        accuracy = clf.score(X, cell_types)
        print("Accuracy on the whole set: %.4f" % accuracy)

    # get predicted probability on the test set
    y_prob = None
    if classifier not in ["SH", "PCP"]:
        y_prob = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)

    return y_prob, y_pred, y_test, clf, cvsm, accuracy_test
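
# Usage sketch (not from the original source): it assumes self_projection() and the
# sklearn/numpy objects it relies on are already imported in this module, and uses a
# synthetic "expression matrix" from make_blobs purely for illustration.
import numpy as np
from sklearn.datasets import make_blobs

X_demo, clusters = make_blobs(n_samples=300, n_features=50, centers=4, random_state=0)
cell_types_demo = ["cluster_%d" % c for c in clusters]

# n=0 -> a plain stratified split with `fraction` as the test size (see the code above);
# classifier="GNB" selects GaussianNB and cv=3 reports a 3-fold CV accuracy on the training set.
y_prob, y_pred, y_test, clf, cvsm, acc_test = self_projection(
    X_demo, cell_types_demo, classifier="GNB", n=0, fraction=0.5, cv=3)
print("mean CV accuracy: %.4f, hold-out accuracy: %.4f" % (cvsm, acc_test))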
from sklearn.metrics import confusion_matrix
cm_Decision_Tree = confusion_matrix(Y_test, Y_pred_Decision_Tree)
#Accuracy score calculation for Decision Tree Model
from sklearn.metrics import accuracy_score
acc_decision_tree = accuracy_score(Y_test,Y_pred_Decision_Tree)
print(acc_decision_tree)

# Fitting Naive Bayes Algorithm to Training set
#Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_naive_bayes = sc.fit_transform(X_train)
X_test_naive_bayes = sc.transform(X_test)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train_naive_bayes, Y_train)

# Predicting naive_bayes Test set results
Y_pred_naive_bayes = classifier.predict(X_test_naive_bayes)

# Confusion Matrix for naive_bayes_model
from sklearn.metrics import confusion_matrix
cm_naive_bayes = confusion_matrix(Y_test, Y_pred_naive_bayes)
#Accuracy score calculation for naive_bayes_model
from sklearn.metrics import accuracy_score
acc_naive_bayes = accuracy_score(Y_test, Y_pred_naive_bayes)
print(acc_naive_bayes)

# Fitting Random Forest Classification to Training set
from sklearn.ensemble import RandomForestClassifier
Example #56
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

train_df = pd.read_csv('./glass.csv')
X_train = train_df.drop("Type", axis=1)
Y_train = train_df["Type"]

X_train, X_test, Y_train, y_test = train_test_split(X_train,
                                                    Y_train,
                                                    test_size=0.4,
                                                    random_state=0)
gnb = GaussianNB()

gnb.fit(X_train, Y_train)
Y_pred = gnb.predict(X_test)
acc_gnb = round(gnb.score(X_train, Y_train) * 100, 2)
print("GNB training accuracy is:", acc_gnb)
## function to fit a classifier and return its predictions
## (the function header was cut off in this excerpt; the signature below is
##  inferred from the get_predictions(...) calls further down)

def get_predictions(clf, X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)
    return y_pred, y_pred_prob

## function to get classifiers score 

def print_scores(y_test,y_pred,y_pred_prob):
    print('test-set confusion matrix:\n', confusion_matrix(y_test,y_pred)) 
    print("recall score: ", recall_score(y_test,y_pred))
    print("precision score: ", precision_score(y_test,y_pred))
    print("f1 score: ", f1_score(y_test,y_pred))
    print("accuracy score: ", accuracy_score(y_test,y_pred))
    print("ROC AUC: {}".format(roc_auc_score(y_test, y_pred_prob[:,1])))

#%%

# training a naive bayes model for classification 
y_pred, y_pred_prob = get_predictions(GaussianNB(), X_train, y_train, X_test)

print_scores(y_test,y_pred,y_pred_prob)

# Accuracy = 96.91 %


# hence we can see that the model has correctly classified all 135 values as frauds / shill bidders

#%%
# training a logistic regression model 
y_pred, y_pred_prob = get_predictions(LogisticRegression(C = 0.01, penalty = 'l1', solver = 'liblinear'), X_train, y_train, X_test)  # 'liblinear' supports the l1 penalty; the default 'lbfgs' solver does not

print_scores(y_test,y_pred,y_pred_prob)

# Accuracy = 96.28 %
    if not pca and estimator_name not in ['GaussianNB', 'NeuralNetwork']:
        process_feature_importances(model, estimator_name, pca, fine_tune)

gridsearch_param = {'scoring': 'roc_auc', 'verbose': 2 , 'n_jobs': -1, 'cv': 3}
estimators_params_grid = {
    'LogisticRegression': {'C' : [10**i for i in range(-3,4)], 'penalty': ['l2', 'l1']},
    'DecisionTreeClassifier': {'min_samples_split': [1600, 1800, 2000, 2200, 2400]},
    'RandomForestClassifier': {'n_estimators' : [50,100,200,300,400], 'min_samples_split': [50, 100, 150, 200]},
    'LGBMClassifier': {'num_leaves': [500, 1000, 1500, 2000, 2500], 'n_estimators': [200, 400, 600, 800, 1000]},
    }
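
# The experiment()/fine-tuning helper is not shown in this excerpt; the sketch below is
# only an assumption of how gridsearch_param and estimators_params_grid could be wired
# into sklearn's GridSearchCV for a single estimator (fine_tune_sketch is hypothetical).
from sklearn.model_selection import GridSearchCV

def fine_tune_sketch(estimator, train_x, train_y):
    name = type(estimator).__name__                       # e.g. 'LogisticRegression'
    param_grid = estimators_params_grid.get(name, {})     # empty grid if the estimator is not tuned
    search = GridSearchCV(estimator, param_grid, **gridsearch_param)
    search.fit(train_x, train_y)
    return search.best_estimator_, search.best_params_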

print_info('Start experiments')

experiment(LogisticRegression(random_state=SEED, n_jobs=-1, solver='saga', max_iter=500), train_x, train_y, test_x, test_y, pca = False, fine_tune = True)
experiment(DecisionTreeClassifier(random_state=SEED), train_x, train_y, test_x, test_y, pca = False, fine_tune = True)
experiment(GaussianNB(), train_x, train_y, test_x, test_y, pca = False, fine_tune = False)
experiment(RandomForestClassifier(random_state=SEED, n_jobs=-1), train_x, train_y, test_x, test_y, pca = False, fine_tune = True)

lgbm = lgb.LGBMClassifier(objective='binary',
                          random_state = SEED,
                          feature_fraction=0.7,
                          learning_rate=0.05,
                          n_jobs=-1,
                          silent = False,
                          )
experiment(lgbm, train_x, train_y, test_x, test_y, pca = False, fine_tune = True)

""" Bagging with Lightgbm (Combine boosting and bagging)"""
print_info('Start Bagging with Lightgbm')
lgbm = lgb.LGBMClassifier(objective='binary',
                          random_state = SEED,
Example #59
a = pd.Series(dtype=float)
a_index = list(range(1, 11))   # values of n_neighbors to evaluate
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for i in a_index:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train_X, train_Y)
    prediction = model.predict(test_X)
    # Series.append was removed in pandas 2.x; use pd.concat instead
    a = pd.concat([a, pd.Series([metrics.accuracy_score(prediction, test_Y)])],
                  ignore_index=True)
plt.plot(a_index, a)
plt.xticks(x)
fig = plt.gcf()
fig.set_size_inches(12, 6)
plt.show()
print('Accuracies for different values of n are:', a.values,
      'with the max value as ', a.values.max())

model = GaussianNB()
model.fit(train_X, train_Y)
prediction6 = model.predict(test_X)
print('The accuracy of the NaiveBayes is',
      metrics.accuracy_score(prediction6, test_Y))

model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
prediction7 = model.predict(test_X)
print('The accuracy of the Random Forests is',
      metrics.accuracy_score(prediction7, test_Y))

from sklearn.model_selection import KFold  #for K-fold cross validation
from sklearn.model_selection import cross_val_score  #score evaluation
from sklearn.model_selection import cross_val_predict  #prediction
kfold = KFold(n_splits=10,
Example #60
    shuffled_data = data_file.sample(frac=1)
    X = shuffled_data.iloc[1:, 0]  # Features
    y = shuffled_data.iloc[1:, 1]  # Target variable

    # vectorize and split data
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(X)
    feature_names = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
    X_train, X_test, y_train, y_test = train_test_split(X.toarray(),
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    tot_train = np.append(X_train, np.asarray(y_train)[:, None], axis=1)  # y_train is a pandas Series; convert to an array before adding a column axis

    # train model
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    print("P(Not Spam): " + str(gnb.class_prior_[0]))
    print("P(Spam): " + str(gnb.class_prior_[1]) + "\n")

    # separating spam and non-spam instances
    not_spam = tot_train[np.where(tot_train[:, -1] == 0), :-1][0]
    spam = tot_train[np.where(tot_train[:, -1] == 1), :-1][0]

    # smoothing probs
    not_spam = not_spam + 1
    spam = spam + 1
    X_train_smooth = X_train + 1