def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score

    ### create classifier
    clf = GaussianNB()

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)


    ### calculate and return the accuracy on the test data
    ### this is slightly different than the example, 
    ### where we just print the accuracy
    ### you might need to import an sklearn module
    accuracy = accuracy_score(labels_test, pred)
    return accuracy
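A quick way to exercise this function; a minimal sketch using a synthetic two-class dataset (any numeric arrays of matching shape would do):

import numpy as np

rng = np.random.RandomState(0)
features = rng.randn(100, 2) + np.repeat([[0, 0], [2, 2]], 50, axis=0)
labels = np.repeat([0, 1], 50)
idx = rng.permutation(100)
features, labels = features[idx], labels[idx]

print(NBAccuracy(features[:80], labels[:80], features[80:], labels[80:]))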
# Assumes X and y are module-level fixtures (the iris data, as in mlxtend's test suite).
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from mlxtend.classifier import EnsembleVoteClassifier


def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()
    

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)
    

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)


    ### calculate and return the accuracy on the test data
    ### this is slightly different than the example, 
    ### where we just print the accuracy
    ### you might need to import an sklearn module
    intersect = [i for i, j in zip(pred, labels_test) if i == j]
    matched = len(intersect)
    total = len(labels_test)
    accuracy = float(matched) / float(total)
    return accuracy
Example #4
def main():
    """
        Main function
    """
    # Prepare the dataset
    train_data, test_data = utils.prepare_data()

    # Inspect the dataset
    utils.inspect_dataset(train_data, test_data)

    # Feature engineering:
    # build the training and test data
    X_train, X_test = utils.do_feature_engineering(train_data, test_data)

    print('There are {} feature dimensions in total.'.format(X_train.shape[1]))

    # Label processing
    y_train = train_data['label'].values
    y_test = test_data['label'].values

    # Model training and validation
    print('\n===================== Model Training and Validation =====================')
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    y_pred = nb_model.predict(X_test)

    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('AUC:', roc_auc_score(y_test, y_pred))
	def scikitNBClassfier(self):
		dataMat, labels = self.loadProcessedData()
		bayesian = Bayesian()
		myVocabList = bayesian.createVocabList(dataMat)
		## build the bag-of-words matrix
		trainMat = []
		for postinDoc in dataMat:
			trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))

		from numpy import array
		from sklearn.naive_bayes import GaussianNB

		gnb = GaussianNB()
		X = array(trainMat)
		y = labels

		# Chinese test sentence, roughly: "US navy warships visited a Brazilian
		# port city today and for the first time showed off nuclear-submarine
		# strike capability, aircraft, surveillance. They played football."
		testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
		testEntry = self.testEntryProcess(testText)

		thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
		## fit the model, then predict
		gnb.fit(X, y)
		print(gnb.predict(thisDoc.reshape(1, -1)))  # predicted class for the test entry
		clabels = ['军事', '体育']  # class labels: military, sports
		y_pred = gnb.predict(X)
		print("Number of mislabeled points : %d" % (labels != y_pred).sum())
Example #6
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)
    


    ### calculate and return the accuracy on the test data
    ### this is slightly different than the example, 
    ### where we just print the accuracy
    ### you might need to import an sklearn module
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(labels_test, pred)
    return accuracy
Example #7
def categorize(train_data,test_data,train_class,n_features):
    # A range of alternatives (ExtraTrees, LinearSVC, KNN, SelectKBest/chi2,
    # RFE) was tried here; of the classifiers built below, only the Gaussian
    # Naive Bayes model (gnb) is actually trained and used.
    model = LogisticRegression()
    lgr = LogisticRegression(C=100.0, penalty='l1')
    svmlcf = sklearn.svm.SVC(C=1000.0, kernel='linear', degree=1, gamma=0.01, probability=True)
    svmcf = sklearn.svm.SVC(C=1000.0, kernel='rbf', degree=1, gamma=0.01, probability=True)
    cf = DecisionTreeClassifier()
    dct = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=7, min_samples_leaf=4)
    rf = RandomForestClassifier(n_estimators=10, criterion='gini', min_samples_split=7, min_samples_leaf=4, max_features='auto')
    gnb = GaussianNB()
    adbst = sklearn.ensemble.AdaBoostClassifier(base_estimator=rf, n_estimators=5, learning_rate=1.0, algorithm='SAMME.R', random_state=True)

    gnb.fit(train_data, train_class)
    return gnb.predict(test_data)
class GaussianColorClassifier(ContourClassifier):
    '''
    A contour classifier which classifies a contour
    based on its mean color in BGR, HSV, and LAB colorspaces,
    using a Gaussian classifier for these features.

    For more usage info, see class ContourClassifier
    '''
    FEATURES = ['B', 'G', 'R', 'H', 'S', 'V', 'L', 'A', 'B']

    def __init__(self, classes, **kwargs):
        super(GaussianColorClassifier, self).__init__(classes, **kwargs)
        self.classifier = GaussianNB()

    def get_features(self, img, mask):
        mean = cv2.mean(img, mask)
        mean = np.array([[mean[:3]]], dtype=np.uint8)
        mean_hsv = cv2.cvtColor(mean, cv2.COLOR_BGR2HSV)
        mean_lab = cv2.cvtColor(mean, cv2.COLOR_BGR2LAB)
        features = np.hstack((mean.flatten(), mean_hsv.flatten(), mean_lab.flatten()))
        return features

    def classify_features(self, features):
        return self.classifier.predict(features)

    def feature_probabilities(self, features):
        return self.classifier.predict_proba(features)

    def train(self, features, classes):
        self.classifier.fit(features, classes)
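A rough usage sketch, assuming the ContourClassifier base accepts just a class list and that the classifier is trained directly on 9-dimensional color feature rows (the data here is synthetic, not from the original source):

import numpy as np

classifier = GaussianColorClassifier(classes=['red_buoy', 'green_buoy'])

# Two synthetic color clusters, one per class (columns: B, G, R, H, S, V, L, A, B).
feats = np.vstack([np.random.rand(10, 9) * 50,
                   np.random.rand(10, 9) * 50 + 100])
labs = np.array([0] * 10 + [1] * 10)

classifier.train(feats, labs)
print(classifier.classify_features(feats[:2]))       # predicted classes
print(classifier.feature_probabilities(feats[:2]))   # per-class probabilities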
Example #9
def NB_experiment(data_fold, train, test, dumper):

    print "Ready to find the Best Parameters for Naive Bayes"

    print 'Gaussian Naive Bayes'
    nb = GNB()
    print "fitting NaiveBayes Experiment"

    dumper.write('Classifier: Naive Bayes\n')
    scores = cross_validation.cross_val_score(nb, train[0], train[1], 
                                              cv = data_fold, score_func=accus)

    reports = "Accuracy on Train: %0.2f (+/- %0.2f)"%(scores.mean(), scores.std()/2)
    print reports

    dumper.write(reports+'\n')
    reports = " ".join(['%0.2f'%(item) for item in scores])
    dumper.write(reports+'\n')
    
    nb = GNB()
    nb.fit(train[0], train[1])
    
    pred = clf_test(nb, test)
    output_ranking(pred, codecs.open('nb.ranking', 'w', 'utf-8'))
    return None
Example #10
def getGaussianPred(featureMatrix, labels, testSet, testSet_docIndex):
    """
    All input arguments are return of getTrainTestData()
    :param featureMatrix:
    :param labels:
    :param testSet:
    :param testSet_docIndex:
    :return docIndexPred: dict{docid: [index1, index2, ...], ...}
                        key is docid
                        value is all cognates' index
    """
    gnb = GaussianNB()
    gnb.fit(featureMatrix, labels)
    # pred = gnb.predict(featureMatrix)
    pred = gnb.predict(testSet)

    docIndexPred = dict()

    for i, p in enumerate(pred):
        if p:
            docid = testSet_docIndex[i, 0]
            index = testSet_docIndex[i, 1]
            if docid in docIndexPred:
                docIndexPred[docid].append(index)
            else:
                docIndexPred[docid] = [index]

    return docIndexPred
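A minimal, hypothetical call with synthetic arrays (getTrainTestData() is assumed to return these shapes; the docids and indices are made up):

import numpy as np

featureMatrix = np.random.rand(20, 5)            # 20 training rows, 5 features
labels = np.random.randint(0, 2, size=20)        # binary cognate labels
testSet = np.random.rand(8, 5)
testSet_docIndex = np.column_stack(([1] * 4 + [2] * 4, range(8)))

print(getGaussianPred(featureMatrix, labels, testSet, testSet_docIndex))
# e.g. {1: [0, 2], 2: [5, 7]}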
Example #11
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB
    from time import time

    ### create classifier
    clf = GaussianNB()

    t0 = time()
    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)
    print "training time:", round(time()-t0, 3), "s"

    ### use the trained classifier to predict labels for the test features
    t1 = time()
    pred = clf.predict(features_test)
    print "predicting time:", round(time()-t1, 3), "s"

    ### calculate and return the accuracy on the test data
    ### this is slightly different than the example,
    ### where we just print the accuracy
    ### you might need to import an sklearn module
    accuracy = clf.score(features_test, labels_test)
    return accuracy
Example #12
def NBAccuracy(features_train, labels_train, features_test, labels_test):
	#Import sklearn modules for GaussianNB
	from sklearn.naive_bayes import GaussianNB
	from sklearn.metrics import accuracy_score

	#Create classifier
	classifier = GaussianNB()

	#Timing the fit algorithm
	t0 = time()

	#Fit classifier on the training features
	classifier.fit(features_train, labels_train)

	print "Training Time: ", round(time() - t0, 3), "s"

	#Timing the prediction algorithm
	t0 = time()

	#Use trained classifier to predict labels for test features
	pred = classifier.predict(features_test)

	print "Prediction Time: ", round(time() - t0, 3), "s"

	#Calculate accuracy by comparing predictions on features_test with the answers in labels_test
	accuracy = accuracy_score(pred, labels_test)

	return accuracy
def performNB(trainingScores, trainingResults, testScores):
	print "->Gaussian NB"
	X = []
	# This loop just leaves currMark bound to the last key of trainingScores.
	for currMark in trainingScores:
		pass
	for idx in range(0, len(trainingScores[currMark])):
		X.append([])

	for currMark in trainingScores:
		if "Asym" in currMark:
			continue
		print currMark, 
		for idx in range(0, len(trainingScores[currMark])):
			X[idx].append(trainingScores[currMark][idx])

	X_test = []
	for idx in range(0, len(testScores[currMark])):
		X_test.append([])

	for currMark in trainingScores:
		if "Asym" in currMark:
			continue
		for idx in range(0, len(testScores[currMark])):
			X_test[idx].append(testScores[currMark][idx])
	gnb = GaussianNB()
	gnb.fit(X, np.array(trainingResults))
	y_pred = gnb.predict_proba(X_test)[:, 1]
	print "->Gaussian NB"
	return y_pred
Example #14
def NB(text):
    ### features_train and features_test are the features for the training
    ### and testing datasets, respectively
    ### labels_train and labels_test are the corresponding item labels
    features_train, features_test, labels_train, labels_test = Preprocess()
    Ifeatures_train,Ifeatures_test,Ilabels_train=preprocess_input([text])

    # classification goes here

    clf = GaussianNB()

    # training
    train_t0 = time()
    clf.fit(features_train, labels_train)
    train_t1 = time()

    # prediction or testing
    test_t0 = time()
    predict = clf.predict(features_test)
    test_t1 = time()

    print "accuracy: ", clf.score(features_test, labels_test)
    print "#################################"
    print "tain time: ", round(train_t1 - train_t0, 3), "s"
    print "prediction time: ", round(test_t1 - test_t0, 3), "s"

    print "#################################"

    clf.fit(Ifeatures_train,Ilabels_train)
    print ("prediction of ",str(clf.predict(Ifeatures_test))[1])

    #print "prediction of ", clf.predict(preprocess_input(text))
    return  str(clf.predict(Ifeatures_test))[1]
class GaussianNBClassifier:

	def __init__(self):
		"""
		This is the constructor responsible for initializing the classifier
		"""
		self.outputHeader = "#gnb"
		self.clf = None

	def buildModel(self):
		"""
		This builds the model of the Gaussian NB classifier
		"""
		self.clf =  GaussianNB()

	def trainGaussianNB(self,X, Y):
		"""
		Training the Gaussian NB Classifier
		"""
		self.clf.fit(X, Y)

	def validateGaussianNB(self,X, Y):
		"""
		Validate the Gaussian NB Classifier
		"""
		YPred = self.clf.predict(X)
		print accuracy_score(Y, YPred)

	def testGaussianNB(self,X, Y):
		"""
		Test the Gaussian NB Classifier
		"""
		YPred = self.clf.predict(X)
		print accuracy_score(Y, YPred)
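A minimal usage sketch (assumes GaussianNB and accuracy_score are imported in the class's module; the arrays here are illustrative):

import numpy as np

gnb_clf = GaussianNBClassifier()
gnb_clf.buildModel()

X = np.array([[0.0, 1.0], [1.0, 0.0], [0.9, 0.1], [0.1, 0.8]])
Y = np.array([0, 1, 1, 0])
gnb_clf.trainGaussianNB(X, Y)
gnb_clf.validateGaussianNB(X, Y)   # prints the accuracy on this data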
def classify(features_train, labels_train):
    ### import the sklearn module for GaussianNB
    ### create classifier
    ### fit the classifier on the training features and labels
    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    return clf
Example #17
def naive_bayes(features, labels):
    classifier = GaussianNB()
    classifier.fit(features, labels)
    scores = cross_validation.cross_val_score(
        classifier, features, labels, cv=10, score_func=metrics.precision_recall_fscore_support
    )
    print_table("Naive Bayes", numpy.around(numpy.mean(scores, axis=0), 2))
def test_gnb_prior():
    # Test whether class priors are properly set.
    # (X, y and X1, y1 are module-level fixtures in scikit-learn's test suite.)
    clf = GaussianNB().fit(X, y)
    assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8)
    clf.fit(X1, y1)
    # Check that the class priors sum to 1
    assert_array_almost_equal(clf.class_prior_.sum(), 1)
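For reference, a self-contained sketch of the same check; the six-sample toy data mirrors the X, y fixture used in scikit-learn's tests:

import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
y = np.array([1, 1, 1, 2, 2, 2])

clf = GaussianNB().fit(X, y)
# Three samples per class, so each prior is 3/6 = 0.5, and the priors sum to 1.
np.testing.assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8)
assert abs(clf.class_prior_.sum() - 1.0) < 1e-8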
Example #19
def nb_names():
	#generate list of tuple names
	engine = create_engine('sqlite:///names.db')
	DBSession = sessionmaker(bind=engine)
	session = DBSession()
	db_names = names.Names.getAllNames(session)
	names_list = [(x,'name') for x in db_names]
	words_list = generate_words()
	sample_names = [names_list[i] for i in sorted(random.sample(xrange(len(names_list)), len(words_list)))]

	data = sample_names + words_list
	shuffled_data = np.random.permutation(data)
	strings = []
	classification = []
	for item in shuffled_data:
		strings.append([item[0]])
		classification.append(str(item[1]))


	X = np.array(strings)
	Y = np.array(classification)

	print X,Y
	clf = GaussianNB()
	# NOTE: GaussianNB expects numeric features; the raw strings in X would
	# need to be vectorized (e.g., into character-count features) before fitting.
	clf.fit(X, Y)
Example #20
def trainNB():
    

    featureVector = []
    classVector = []
    temp= []
    headerLine = True


    #training
    train = open(r'C:\Python34\alchemyapi_python\TrainingDataDummy.csv')

    for line in train:
        if(headerLine):
            headerLine = False
        else:
            temp = line.split(",")
            x = [float(temp[i]) for i in activeFeatureIndex]
            #print(x)
            featureVector.append(x)
            #temp = [int(x) for x in line.split(",")[-1].rstrip("\n")]
            classVector.append(int(line.split(",")[-1].rstrip("\n")))

        
    fVector = np.array(featureVector)
    cVector = np.array(classVector)
    #print(classVector)
    print(fVector.shape)
    print(cVector.shape)

    clf = GaussianNB()
    clf.fit(fVector,cVector)
    train.close()

    return clf
Example #21
class CruiseAlgorithm(object):
	# cruise algorithm is used to classify the cruise phase vs noncruise phase, it uses the differential change in data stream as the input matrix
	def __init__(self, testing=False):
		self.core = GaussianNB()
		self.scaler = RobustScaler()
		self.X_prev = None
		self.testing = testing
	def fit(self,X,Y): # Y should be the label of cruise or not
		X = self.prepare(X)
		self.core.fit(X,Y.ravel())
	def predict(self, X):
		if self.testing:
			X_t = self.prepare(X)
		else:
			if self.X_prev is not None:
				X_t = X - self.X_prev
			else:
				X_t = X
			self.X_prev = X

		print repr(X_t)
		prediction_result = self.core.predict(X_t)
		return np.asmatrix(prediction_result)

	def prepare(self,X):
		a = np.zeros((X.shape[0],X.shape[1]))
		for i in xrange(X.shape[0]-1):
			a[i+1,:] = X[i+1] - X[i]
		return a
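A rough usage sketch with synthetic telemetry (shapes and labels are illustrative; testing=True routes predict() through the same differencing as fit()):

import numpy as np

# 10 time steps of 3-channel telemetry; 1 = cruise, 0 = not cruise.
X = np.cumsum(np.random.randn(10, 3), axis=0)
Y = np.array([0, 0, 1, 1, 1, 1, 1, 0, 0, 0])

algo = CruiseAlgorithm(testing=True)
algo.fit(X, Y)           # trains on the differenced stream
algo.predict(X)          # returns a matrix of cruise/non-cruise predictions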
def selectKBest(previous_result, data):
	# remove 'restricted_stock_deferred' and 'director_fees'
	previous_result.pop(4)
	previous_result.pop(4)

	result = []
	_k = 10
	for k in range(0,_k):
		feature_list = ['poi']
		for n in range(0,k+1):
			feature_list.append(previous_result[n][0])

		data = featureFormat(my_dataset, feature_list, sort_keys = True, remove_all_zeroes = False)
		labels, features = targetFeatureSplit(data)
		features = [abs(x) for x in features]
		from sklearn.cross_validation import StratifiedShuffleSplit
		cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
		features_train = []
		features_test  = []
		labels_train   = []
		labels_test    = []
		for train_idx, test_idx in cv:
			for ii in train_idx:
				features_train.append( features[ii] )
				labels_train.append( labels[ii] )
			for jj in test_idx:
				features_test.append( features[jj] )
				labels_test.append( labels[jj] )
		from sklearn.naive_bayes import GaussianNB
		clf = GaussianNB()
		clf.fit(features_train, labels_train)
		predictions = clf.predict(features_test)
		score = score_func(labels_test,predictions)
		result.append((k+1,score[0],score[1],score[2]))
	return result
Example #23
class RegularizedGaussianNB:
  """
  Three types of regularization are possible:
    - regularize the variance of a feature within a class toward the
      average variance of all features from that class
    - regularize the variance of a feature within a class toward its
      pooled variance across all classes
    - add some constant amount of variance to each feature
  In practice, the latter seems to work the best, though the regularization
  value should be cross-validated. 
  """
  def __init__(self, avg_weight = 0, pooled_weight = 0, extra_variance = 0.1):
    self.pooled_weight = pooled_weight
    self.avg_weight = avg_weight
    self.extra_variance = extra_variance
    self.model = GaussianNB()
    
  def fit(self, X,Y):
    self.model.fit(X,Y)
    p = self.pooled_weight
    a = self.avg_weight
    ev = self.extra_variance 
    original_weight = 1.0 - p - a
    pooled_variances = np.var(X, 0)
    for i in xrange(self.model.sigma_.shape[0]):
      class_variances = self.model.sigma_[i, :]
      new_variances = original_weight*class_variances + \
        p * pooled_variances + \
        a * np.mean(class_variances) + \
        ev 
      self.model.sigma_[i, :] = new_variances
        
        
  def predict(self, X):
    return self.model.predict(X)
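A quick usage sketch on synthetic data. Note the class reads and writes model.sigma_, which scikit-learn renamed to var_ in 1.0, and it uses xrange, so as written it targets Python 2 and older scikit-learn releases:

import numpy as np

X = np.random.randn(100, 4)
Y = (X[:, 0] + 0.1 * np.random.randn(100) > 0).astype(int)

model = RegularizedGaussianNB(extra_variance=0.1)
model.fit(X, Y)
model.predict(X[:5])   # array of 0/1 predictions for the first five rows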
Example #24
def test_classification():
    t = zeros(len(target))
    t[target == 'setosa'] = 1
    t[target == 'versicolor'] = 2
    t[target == 'virginica'] = 3

    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(data,t) # training on the iris dataset

    print classifier.predict(data[0])
    print t[0]


    from sklearn import cross_validation
    train, test, t_train, t_test = cross_validation.train_test_split(data, t, test_size=0.4, random_state=0)

    classifier.fit(train,t_train) # train
    print classifier.score(test,t_test) # test

    from sklearn.metrics import confusion_matrix
    print confusion_matrix(classifier.predict(test),t_test)

    from sklearn.metrics import classification_report
    print classification_report(classifier.predict(test), t_test, target_names=['setosa', 'versicolor', 'virginica'])

    from sklearn.cross_validation import cross_val_score
    # cross validation with 6 iterations 
    scores = cross_val_score(classifier, data, t, cv=6)
    print scores

    from numpy import mean
    print mean(scores)
Example #25
def simple_svm_train(emotion, training_set):

	song_list = []
	sizes_list = []
	other_emotions = []

	# print 'Start to sample set'
	# Setting up the data
	sampled_dict = create_sample_dict(training_set)
	# print 'Set sampled, extracting features'
	feature_vector, class_vector, test_values, test_class = extract_features(sampled_dict, emotion, training_set)

	# Creating the classifier using sklearn
	# print 'Extracted features, training classifier'
	clf = GaussianNB()
	clf.fit(feature_vector,class_vector)

	# clf = svm.SVC(max_iter = 10000)
	# clf.fit(feature_vector,class_vector)
	# print 'Finished training classifier'


	# Testing and analyzing results
	results = test_classifier(clf, emotion, test_values)
	return  post_process_results(results, emotion)
Example #26
 def MyNaiveBayes(object):
     pre = PreProcess()
     (training_value, test_value, test_pos_x, test_pos_y, training_pos_x, training_pos_y) = pre.split()
     # Initialize the models
     clf_x = GaussianNB()
     clf_y = GaussianNB()
     # Train the models
     clf_x.fit(training_value, training_pos_x)
     clf_y.fit(training_value, training_pos_y)
     # Compute the predictions
     result_pos_x = clf_x.predict(test_value)
     result_pos_y = clf_y.predict(test_value)
     '''
     print result_pos_x
     print test_pos_x
     print result_pos_y
     print test_pos_y
     '''
     # Compute the errors
     x_dis = []
     y_dis = []
     d_dis = []
     for i in range(len(result_pos_x)):
         x_dis.append(abs(result_pos_x[i] - test_pos_x[i]))
         y_dis.append(abs(result_pos_y[i] - test_pos_y[i]))
         d_dis.append(math.sqrt((result_pos_x[i]-test_pos_x[i])**2+(result_pos_y[i]-test_pos_y[i])**2))
     x = (sum(x_dis))/len(result_pos_x)
     y = (sum(y_dis))/len(result_pos_y)
     d = (sum(d_dis))/len(d_dis)
     print x, y, d
     return x, y, d
Example #27
def myClassifier(X,Y,model,CV=4, scoreType='pure'):
    # X = [[0, 0], [1, 1],[1, 2]]
    # y = [0, 1, 2]
    score = {}
    print "Error Analysis using", scoreType
    if model == "SVM":
        clf = svm.SVC(probability=True, random_state=0, kernel='rbf')        
        #clf = svm.SVR(cache_size=7000)        
        
    elif model == "LR":
        clf = linear_model.LogisticRegression()
        clf.fit(X, Y)        

    elif model == "NB":
         clf = GaussianNB()
         clf.fit(X, Y)
         
    elif model=='MLP': # multilayer perceptron
         clf = MLPClassifier( hidden_layer_sizes=[100],algorithm='l-bfgs')
         clf.fit(X, Y)
    
    if scoreType == 'cv':     
        accu = np.mean(cross_validation.cross_val_score(clf, X, Y, scoring='accuracy',cv=CV))
    elif scoreType == 'pure':
        predictions = clf.predict(X)
        # use float division so the accuracy is not truncated under Python 2
        accu = sum([int(predictions[q] == Y[q]) for q in range(len(Y))]) / float(len(Y))
    return accu, clf
def createNaiveBayesModel(feature_vector_data):
    '''
        Uses the dimensionally reduced feature vectors of each of the instance, sense id pairs
        to create a naive bayes model
    '''
    naive_bayes_model_word_type = {}
    
    for word_type, instance_sense_dict in feature_vector_data.iteritems():
        vectors = []
        senses  = []
        
        for i in xrange(len(instance_sense_dict)):
            sense = instance_sense_dict.keys()[i][1]
            data_type = instance_sense_dict.keys()[i][2]
            
            #Need to grab the TSNE vectors and senses of only the training data
            #Thus, we ignore all the validation data
            if  data_type == "training":
                vectors.append(instance_sense_dict.values()[i])
                senses.append(sense)
            
        vectors = np.array(vectors)
        senses = np.array(senses)
        nb = GaussianNB()
        nb.fit(vectors, senses)
        naive_bayes_model_word_type[word_type] = nb
    
    return naive_bayes_model_word_type
Example #29
def boundaries():
    # import some data to play with
    iris = datasets.load_iris()
    X = iris.data[:, :2] 
    y = iris.target    
    h = .02
    means = np.empty((X.shape[1], len(set(y))))
    for i,lab in enumerate(list(set(y))):
        means[:,i] = X[y==lab].mean(axis=0)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    nb = GaussianNB()
    nb.fit(X, y)
    Z = nb.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
    plt.scatter(means[0,:], means[1,:])
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.savefig("decision_boundary.pdf")
    plt.clf()
def univariateFeatureSelection(f_list, my_dataset):
	result = []
	for feature in f_list:
		# Replace 'NaN' with 0
		for name in my_dataset:
			data_point = my_dataset[name]
			if not data_point[feature]:
				data_point[feature] = 0
			elif data_point[feature] == 'NaN':
				data_point[feature] =0

		data = featureFormat(my_dataset, ['poi',feature], sort_keys = True, remove_all_zeroes = False)
		labels, features = targetFeatureSplit(data)
		features = [abs(x) for x in features]
		from sklearn.cross_validation import StratifiedShuffleSplit
		cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
		features_train = []
		features_test  = []
		labels_train   = []
		labels_test    = []
		for train_idx, test_idx in cv:
			for ii in train_idx:
				features_train.append( features[ii] )
				labels_train.append( labels[ii] )
			for jj in test_idx:
				features_test.append( features[jj] )
				labels_test.append( labels[jj] )
		from sklearn.naive_bayes import GaussianNB
		clf = GaussianNB()
		clf.fit(features_train, labels_train)
		predictions = clf.predict(features_test)
		score = score_func(labels_test,predictions)
		result.append((feature,score[0],score[1],score[2]))
	result = sorted(result, reverse=True, key=lambda x: x[3])
	return result
# (clf1-clf8 are classifiers fit earlier in the original script.)
accuracy5 = clf5.score(X_test,y_test)

clf6.fit(X_train,y_train)
accuracy6 = clf6.score(X_test,y_test)

clf7.fit(X_train,y_train)
accuracy7 = clf7.score(X_test,y_test)

clf8.fit(X_train,y_train)
accuracy8 = clf8.score(X_test,y_test)
print(accuracy1,accuracy2,accuracy3,accuracy4,accuracy5,accuracy6,accuracy7,accuracy8)

from sklearn.naive_bayes import GaussianNB

clfnb = GaussianNB()
clfnb.fit(X_train, y_train)
accuracyNB = clfnb.score(X_test,y_test)

print("In Gaussian NB")
print (accuracyNB)


##WITH TruncatedSVD + KNN
from sklearn.decomposition import PCA, FastICA,TruncatedSVD
from sklearn.pipeline import Pipeline
trun = TruncatedSVD()
dm_reductions = [trun]  
clf_details = [clf]
estimators = [('dm_reduce', trun), ('clf', clf)]
pipeline = Pipeline(estimators)        
best_pipe = pipeline.fit(X_train, y_train)
n = [1500,5000,7000,10000,20000]
y_data[y_data == 6] = 0
y_data[y_data == 7] = 1
y_data[y_data == 8] = 1
y_data[y_data == 9] = 2
y_data[y_data == 10] = 2
df = pd.DataFrame(y_data)
fs = []
acc = []
df=df.astype('int')
genes_transpose = np.transpose(x_data)
for i in range(0,5):
    X_new = SelectKBest(chi2, k=n[i]).fit_transform(genes_transpose, df)
    classifier = GaussianNB()             
    X_train, X_test,y_train,y_test = train_test_split(X_new,df,test_size=0.3)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)      
    cnf_matrix.astype(float)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)   
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)        
    FP = FP.astype('float')                                      
    FN = FN.astype('float')          
    TP = TP.astype('float')            
    TN = TN.astype('float')          
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Specificity or true negative rate
    TNR = TN/(TN+FP) 
Example #33
Y = df.iloc[:, -1]

# Encoding categorical values
X = pd.get_dummies(X, columns=['Gender'], drop_first=True)

# Train-Test-Split
X_train = X.sample(frac=0.8, random_state=1)
X_test = X.drop(X_train.index)
Y_test = Y.drop(X_train.index)
Y_train = Y.drop(Y_test.index)

X_train = X_train.sort_index()

# Scaling values
from sklearn.preprocessing import StandardScaler
Sc_X = StandardScaler()
X_train = Sc_X.fit_transform(X_train)
X_test = Sc_X.transform(X_test)

# Making a Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
c_m = confusion_matrix(Y_test, Y_pred)

print(c_m)
Example #34
def domestic_model_initialise():
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='',
                                 db='crickml',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            # Read a single record
            sql = "SELECT * FROM `domestic_stats`"
            cursor.execute(sql)
            result = cursor.fetchall()
            player_list = []
            for player in result:
                career_score = batsmen_model(player['overall_matches'],
                                             player['overall_innings'],
                                             player['overall_average'],
                                             player['overall_100s'],
                                             player['overall_50s'])
                player_list.append([
                    player['overall_average'] * player['overall_strike_rate'],
                    career_score
                ])

        with connection.cursor() as cursor:
            # Read a single record
            sql = "SELECT * FROM `domestic_stats`"
            cursor.execute(sql)
            result = cursor.fetchall()
            intl_performance_list = []
            performance_list = []
            for player in result:
                performance_score = batsmen_performance_model(
                    player['intl_matches'], player['intl_innings'],
                    player['intl_average'], player['intl_100s'],
                    player['intl_50s'])

                intl_performance_list.append([performance_score])
    finally:
        print('done')
        # return pre_performance

    np_intl_performances_list = np.array(intl_performance_list)
    mean_performance = sum(
        np_intl_performances_list[:, 0]) / len(np_intl_performances_list)

    for performance in intl_performance_list:
        if (performance <= mean_performance):
            performance_list.append(0)
        else:
            performance_list.append(1)

    # finally:
    #     connection.close()
    # # print(np_players)
    np_players = np.array(player_list)
    np_players = np_players.astype(float)
    np_performances = np.array(performance_list)

    max_batting_pos = np.max(np_players[:, 0])
    max_milestone_score = np.max(np_players[:, 1])

    for player in np_players:
        batting_pos_score = player[0]
        batting_milestone_score = player[1]
        # batting_runs_score = player[2]

        # normalize each player's scores in place (this must happen inside the loop)
        normalized_batting_pos_score = batting_pos_score / max_batting_pos
        normalized_batting_milestone_score = batting_milestone_score / max_milestone_score
        # normalized_runs_score = batting_runs_score/max_runs_score

        player[0] = normalized_batting_pos_score
        player[1] = normalized_batting_milestone_score
        # player[2] = normalized_runs_score

    sm = SMOTE(random_state=42)
    np_players_resampled, np_performances_resampled = sm.fit_resample(
        np_players, np_performances)

    feature_train, feature_test, target_train, target_test = train_test_split(
        np_players_resampled,
        np_performances_resampled,
        test_size=0.20,
        random_state=42)

    print("Training Domestoc Models")
    print(feature_test)
    svm_clf = SVC(C=1000, kernel='sigmoid', gamma=0.001, probability=True)
    svm_clf.fit(feature_train, target_train)
    svm_pred = svm_clf.predict(feature_test)
    svm_pred_prob = svm_clf.predict_proba(feature_test)
    # print(svm_pred_prob)
    # acc = accuracy_score(svm_pred, target_test)
    # print('Accuracy :', acc)
    # print(classification_report(target_test, svm_pred))

    gnb = GaussianNB()
    gnb.fit(feature_train, target_train)
    nb_pred_prob = gnb.predict_proba(feature_test)
    nb_pred = gnb.predict(feature_test)
    # acc = accuracy_score(nb_pred, target_test)
    # print('Accuracy :', acc)
    # print(classification_report(target_test, nb_pred))

    desT = DecisionTreeClassifier()
    desT.fit(feature_train, target_train)
    desc_pred = desT.predict(feature_test)
    desc_pred_prob = desT.predict_proba(feature_test)

    mlp_clf = MLPClassifier(solver='lbfgs',
                            alpha=1e-5,
                            hidden_layer_sizes=(5, 2),
                            random_state=1)
    mlp_clf.fit(feature_train, target_train)
    mlp_pred_prob = mlp_clf.predict_proba(feature_test)
    mlp_pred = mlp_clf.predict(feature_test)

    # acc = accuracy_score(desc_pred, target_test)
    # print('Accuracy :', acc)
    # print(classification_report(target_test, desc_pred))

    miss_nb = 0
    for index, pred in enumerate(nb_pred):
        if (pred != target_test[index]):
            miss_nb += 1
    amt_say_nb = 1 / 2 * (math.log((1 - (miss_nb / 119)) / (miss_nb / 119)))

    miss_mlp = 0
    for index, pred in enumerate(mlp_pred):
        if (pred != target_test[index]):
            miss_mlp += 1

    amt_say_mlp = 1 / 2 * (math.log((1 - (miss_mlp / 119)) / (miss_mlp / 119)))

    miss_svm = 0
    for index, pred in enumerate(svm_pred):
        if (pred != target_test[index]):
            miss_svm += 1

    amt_say_svm = 1 / 2 * (math.log((1 - (miss_svm / 119)) / (miss_svm / 119)))

    miss_desc = 0
    for index, pred in enumerate(desc_pred):
        if (pred != target_test[index]):
            miss_desc += 1

    amt_say_desc = 1 / 2 * (math.log(
        (1 - (miss_desc / 119)) / (miss_desc / 119)))

    print('Amount of say NB :', amt_say_nb)
    print('Amount of say MLP :', amt_say_mlp)
    print('Amount of say SVM :', amt_say_svm)
    print('Amount of say Descision Tree :', amt_say_desc)

    return connection, gnb, mlp_clf, svm_clf, desT, amt_say_desc, amt_say_mlp, amt_say_nb, amt_say_svm, max_batting_pos, max_milestone_score, feature_train, feature_test, target_train, target_test
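The four amt_say values above all apply the AdaBoost-style "amount of say" formula, 0.5 * ln((1 - err) / err), where err is the miss rate over the test set (the hardcoded 119 assumes that test-set size). Factored out as a small sketch:

import math

def amount_of_say(miss_count, n_test):
    """AdaBoost-style classifier weight: 0.5 * ln((1 - err) / err)."""
    err = miss_count / n_test
    return 0.5 * math.log((1 - err) / err)

print(amount_of_say(20, 119))   # ~0.80 for 20 misses out of 119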
Example #35

X=[]
Y=[]
i=0
with open(sys.argv[1], "r") as ins:
    for line in ins:
        line = line.strip()
        line1 = line.split(',')
        if(i==0):
            i+=1
            continue
        X.append(map(int,line1[:-1]))
        Y.append(int(line1[-1]))
clf = GaussianNB()
clf.fit(X, Y)
already = "../../Suites/Ccausalmarital"
num_atr=[10,8,70,16,7,14,6,5,2,100,40,100,40]
map={}
def check_ratio(fixed,clf):
    if option==3 or option==4:
        fin = open(already,"r")
        requeried={}
        num=0
        den=0
        for line in fin:
            line = line.strip()
            line = line.split(',')
            line = line[:-1]
            i=0
            pos=0
Example #36
Y = [
    'male', 'female', 'female', 'female', 'male', 'male', 'male', 'female',
    'male', 'female', 'male'
]

#classifiers
clf_tree = tree.DecisionTreeClassifier()
clf_svc = svm.SVC()
clf_KNN = KNeighborsClassifier()
clf_NB = GaussianNB()

#training the models
clf_tree = clf_tree.fit(X, Y)
clf_svc = clf_svc.fit(X, Y)
clf_KNN = clf_KNN.fit(X, Y)
clf_NB = clf_NB.fit(X, Y)

prediction_tree = clf_tree.predict(X)
prediction_svc = clf_svc.predict(X)
prediction_KNN = clf_KNN.predict(X)
prediction_NB = clf_NB.predict(X)

result = accuracy_score(Y, prediction_tree)
result1 = accuracy_score(Y, prediction_svc)
result2 = accuracy_score(Y, prediction_KNN)
result3 = accuracy_score(Y, prediction_NB)

print(result)
print(result1)
print(result2)
print(result3)


#OUTPUT:-
#MODEL-1: Accuracy of LogisticRegression :  77.09


#MODEL-2) Gaussian Naive Bayes
#------------------------------------------
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_val)
acc_gaussian = round(accuracy_score(y_pred, y_val) * 100, 2)
print( "MODEL-2: Accuracy of GaussianNB : ", acc_gaussian  )

#OUTPUT:-
#MODEL-2: Accuracy of GaussianNB : 78.68


#MODEL-3) Support Vector Machines
#------------------------------------------
from sklearn.svm import SVC

svc = SVC()
Example #38
#Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
import numpy as np

#assigning predictor and target variables
x = np.array([[-3, 7], [1, 5], [1, 2], [-2, 0], [2, 3], [-4, 0], [-1, 1],
              [1, 1], [-2, 2], [2, 7], [-4, 1], [-2, 7]])
Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])
#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(x, Y)

#Predict Output
predicted = model.predict([[1, 2], [3, 4]])
print(predicted)
def analysis():
    
    (train_original, test_original, full_data) = featureextraction(False)
    for i in range(5):
        test = train_original.iloc[178*i:178*(i+1),:].copy()
        test = test.drop(labels=["Survived"],axis = 1)
        train = train_original.loc[~train_original['PassengerId'].isin(test['PassengerId'])]
        X_train = train.drop("Survived", axis=1)
        X_train = X_train.drop("PassengerId", axis=1).copy()
        Y_train = train["Survived"]
        X_test  = test.drop("PassengerId", axis=1).copy()
        X_train.shape, Y_train.shape, X_test.shape
    
        logreg = LogisticRegression()
        logreg.fit(X_train, Y_train)
        Y_pred = logreg.predict(X_test)
        acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
        print('Logistic regression:',acc_log,'%')
        
        
        svc = SVC()
        svc.fit(X_train, Y_train)
        Y_pred = svc.predict(X_test)
        acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
        print('SVC:',acc_svc,'%')
        
        for k in range(3,8,2):
            knn = KNeighborsClassifier(n_neighbors = k)
            knn.fit(X_train, Y_train)
            Y_pred = knn.predict(X_test)
            acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
            print('%s KNeighbors:'%k,acc_knn,'%')
        
        decision_tree = DecisionTreeClassifier()
        decision_tree.fit(X_train, Y_train)
        Y_pred = decision_tree.predict(X_test)
        acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
        print('Decision Tree:',acc_decision_tree,'%')
        
        random_forest = RandomForestClassifier(n_estimators=100)
        random_forest.fit(X_train, Y_train)
        Y_pred = random_forest.predict(X_test)
        random_forest.score(X_train, Y_train)
        acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
        print('Random Forest:',acc_random_forest,'%')
        
        Naive_bayes = GaussianNB()
        Naive_bayes.fit(X_train, Y_train)
        Y_pred = Naive_bayes.predict(X_test)
        Naive_bayes.score(X_train, Y_train)
        acc_Naive_bayes = round(Naive_bayes.score(X_train, Y_train) * 100, 2)
        print('Naive Bayes:',acc_Naive_bayes,'%')
        
        MLP = MLPClassifier(hidden_layer_sizes=(15,15,15))
        MLP.fit(X_train, Y_train)
        Y_pred = MLP.predict(X_test)
        MLP.score(X_train, Y_train)
        acc_MLP = round(MLP.score(X_train, Y_train) * 100, 2)
        print('MLP:',acc_MLP,'%')
        print('\n')
        
    X_train = train_original.drop("Survived", axis=1)
    X_train = X_train.drop("PassengerId", axis=1)
    Y_train = train_original["Survived"]
    X_test  = test_original.drop("PassengerId", axis=1).copy()
    X_train.shape, Y_train.shape, X_test.shape
    
    Submission_classifier =  KNeighborsClassifier(n_neighbors = 7)
    Submission_classifier.fit(X_train, Y_train)
    Y_pred = Submission_classifier.predict(X_test)
    Submission_classifier.score(X_train, Y_train)
    Submission_classifier_score = round(Submission_classifier.score(X_train, Y_train) * 100, 2)
    
    submission = pd.DataFrame({
        "PassengerId": test_original["PassengerId"],
        "Survived": Y_pred.astype(int)
    })
    submission.to_csv('submission_KNN_2.csv', index=False)
    print('Submission accuracy:',Submission_classifier_score,'%')
def trainModel(X, results):
    print 'Building model...'
    clf = GaussianNB()
    clf.fit(X, results)
    return clf
my_data = pd.read_csv('C:\Projects\ML-BinaryClassification\iris\Iris.csv')

#split the dataset into features and labels
features = my_data.iloc[:, :5]

labels = my_data[my_data.columns[-1]]

# Split our data
train, test, train_labels, test_labels = train_test_split(features,
                                                          labels,
                                                          test_size=0.20,
                                                          random_state=42)

#print(test_labels)
#print (features)
#print (labels)

# Initialize our classifier
gnb = GaussianNB()

# Train our classifier
model = gnb.fit(train, train_labels)

# Make predictions
print(test)
preds = gnb.predict(test)
print(preds)

# Evaluate accuracy
print(accuracy_score(test_labels, preds))
Example #42
# train and test dataset splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)
#feature scaler
from sklearn.preprocessing import StandardScaler
SS_X = StandardScaler()
X_train = SS_X.fit_transform(X_train)
X_test = SS_X.transform(X_test)

#fitting logistic regression to the training data set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

#predicting the trest set results
y_pred = gnb.predict(X_test)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1 = np.arange(start=X_set[:, 0].min() - 1,
               stop=X_set[:, 0].max() + 1,
               step=0.01)
X2 = np.arange(start=X_set[:, 1].min() - 1,
               stop=X_set[:, 1].max() + 1,
               step=0.01)
X1, X2 = np.meshgrid(X1, X2)
plt.contourf(X1,
######################################### - Fitting Model- ###########################################

# Model 1
# Multinomial Naive Bayes
smnb = MultinomialNB()
smnb.fit(X_train_count,y_train)

## Multinomial Model Accuracy
smnb.score(X_train_count,y_train) # 0.99
smnb.score(X_test_count,y_test)  # 0.98

# Model 2
# Gaussian Naive Bayes
sgnb = GaussianNB()
sgnb.fit(X_train_count_array,y_train)

## Gaussian Model Accuracy
sgnb.score(X_train_count_array,y_train) # 0.90
sgnb.score(X_test_count_array,y_test)  # 0.85

# From Above we can Conclude that Multinomial Naive Bayes Model gives us best result. So we are using it for future Predication.

# Prediction on Train & Test Data
pred_train = smnb.predict(X_train_count)
pred_test = smnb.predict(X_test_count)

# Confusion matrix of Train and Test
## Train
confusion_matrix_train = pd.crosstab(y_train,pred_train,rownames=['Actual'],colnames= ['Train Predictions']) 
sns.heatmap(confusion_matrix_train, annot = True, cmap = 'Blues',fmt='g')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB

dataset = pd.read_csv("../datasheets/projection.csv")

cases = np.array(dataset.india.values.tolist())  # y
days = list(range(1, (len(cases) + 1)))  # X
days = np.array([[day] for day in days])

days_pred = list(range(1, (len(cases) + 5)))
days_pred = np.array([[day] for day in days_pred])

clf = GaussianNB()
clf.fit(days, cases)
print(clf.predict(days_pred))
Example #45
"""우리는 Feature 중 sepal에 관련된 두 개의 feature만 이용해서 학습할 것이다. 따라서 이외의 feature는 제거해준다. 그리고 target 값은 현재의 string에서 숫자로 변환해준다. 

그 후 격자 안의 모든 점을 가우시안 나이브 베이즈 모델을 이용하여 예측하고 해당 예측을 통해서 decision boundary를 visualization해준다. 결과는 아래와 같다.
"""

import matplotlib.colors as colors
from sklearn.naive_bayes import GaussianNB
df1 = iris_frame[["sepal length (cm)", "sepal width (cm)", "target"]]
X = df1.iloc[:, 0:2]
Y = df1.iloc[:, 2].replace({
    'setosa': 0,
    'versicolor': 1,
    'virginica': 2
}).copy()
NB = GaussianNB()
NB.fit(X, Y)
N = 100

X_ = np.linspace(4, 8, N)
Y_ = np.linspace(1.5, 5, N)
X_, Y_ = np.meshgrid(X_, Y_)

color_list = ['Blues', 'Greens', 'Reds']
my_norm = colors.Normalize(vmin=-1, vmax=1)
g = sn.FacetGrid(iris_frame, hue="target", size=10, palette='colorblind').map(
    plt.scatter,
    "sepal length (cm)",
    "sepal width (cm)",
).add_legend()

my_ax = g.ax
Example #46
print('Empirical learning curve for RF generated')
X, Y = ([] for i in range(2))

test_label = [train_label[i] for i in range(len(test_data))]
original_test_data = np.array(test_data)  # same for every iteration
clf = GaussianNB()
for sample_size in range(1, len(train_label) // CIGTOTAL):  # floor division keeps range() valid on Python 3
    # train with given sample size
    X.append(sample_size)
    train_subset_label = [
        train_label[i] for i in range(CIGTOTAL * sample_size)
    ]
    train_subset_data = [train_data[i] for i in range(CIGTOTAL * sample_size)]
    train_subset_label = np.array(train_subset_label)
    train_subset_data = np.array(train_subset_data)
    clf.fit(train_subset_data, train_subset_label)

    # test the trained classifier
    predict = clf.predict(original_test_data)
    Y.append(getY(predict, test_label))

fig, ax = plt.subplots(1, figsize=(11, 8))
ax.plot(X, Y)
plt.xticks(np.arange(1, len(train_label) / CIGTOTAL, 1.))
plt.xlabel('sample size')
plt.ylabel('accuracy')
plt.title('Empirical Gaussian NB learning curve for Halo, Juul, Blu, and V2')
fig.savefig('2_ss_lc/nb_lc.png')
plt.show()
Example #47
class EndgamePredictor():
    def __init__(self):
        data = pd.read_csv('CheckEndgame.csv')
        data["Pieces"] = data.apply(
            lambda row: self.gettotalpieces(chess.Board(row["FEN"])), axis=1)
        data["Material"] = data.apply(
            lambda row: self.gettotalmaterial(chess.Board(row["FEN"])), axis=1)
        data["Major Pieces"] = data.apply(
            lambda row: self.getmajorpieces(chess.Board(row["FEN"])), axis=1)

        x = data[['Pieces', 'Material', 'Major Pieces']]
        y = data.Endgame

        self.model = GaussianNB()
        self.model.fit(x, y)

    def is_endgame(self, fen: str):
        board = chess.Board(fen)
        arr = np.array([
            self.gettotalpieces(board),
            self.gettotalmaterial(board),
            self.getmajorpieces(board)
        ])
        result = self.model.predict(arr.reshape(1, -1))  # one sample with three features
        if (result.any()):
            return True
        else:
            return False

    def gettotalmaterial(self, board: chess.Board):
        i = 0
        valfinder = SquareValue()
        material = 0
        while (i < 64):
            piece = board.piece_at(i)
            if (piece):
                if ((piece.piece_type > 1) and (piece.piece_type < 6)):
                    material += abs(
                        valfinder.getpiecevalue(i, chess.WHITE, piece, True))
            i += 1
        return material

    def gettotalpieces(self, board: chess.Board):
        i = 0
        pieces = 0
        while (i < 64):
            piece = board.piece_at(i)
            if (piece):
                pieces += 1
            i += 1
        return pieces - 2

    def getmajorpieces(self, board: chess.Board):
        i = 0
        pieces = 0
        while (i < 64):
            piece = board.piece_at(i)
            if (piece):
                if ((piece.piece_type > 1) and (piece.piece_type < 6)):
                    pieces += 1
            i += 1
        return pieces
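A short usage sketch (requires python-chess and the CheckEndgame.csv training file; the FEN below is the standard starting position, which should not be an endgame):

predictor = EndgamePredictor()
print(predictor.is_endgame("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"))  # expect False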
Example #48
y_data = y_data.reset_index(drop=True)
print('\nPre-processing Done.')

print('\nCount of different classes in Train set:')
print(X_train['Class'].value_counts())

print('\nCount of different classes in Test set:')
print(X_test['Class'].value_counts())

feats=[c for c in X_train.columns if c!='Class']

# Train classifier
print('\nImplementing Gaussian Naive Bayes Model.')
gnb = GaussianNB()
gnb.fit(
    X_train[feats].values,
    y_train['Class']
)
y_pred = gnb.predict(X_test[feats].values)

print("\nNumber of mislabeled points out of a total {} points : {}, Accuracy: {:05.5f}%"
      .format(
          X_test.shape[0],
          (X_test["Class"] != y_pred).sum(),
          100*(1-(X_test["Class"] != y_pred).sum()/X_test.shape[0])
))


cv = KFold(n_splits=5)
clf = GaussianNB()
X_data=X_data.values
y_data=y_data.values
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        df = pd.read_csv(path, encoding="ISO-8859-1")

        filename = request.form['filename']

        str1 = request.form['feature']
        str2 = request.form['label']

        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)

        X = X.str.lower()
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)

        X = pd.Series(texts)
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))

        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1, multi_class='multinomial', solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)

        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()

        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf5 = GaussianNB()

        clf5.fit(X_train_tfidf.toarray(), y_train)
        pred = clf5.predict(tfidfvect.transform(X_test).toarray())
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))

        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred_LR = clf6.predict(tfidfvect.transform(X_test))
        a6 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))

        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred_LR = clf7.predict(tfidfvect.transform(X_test))
        a7 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))

        start = time()
        clf8 = BernoulliNB()

        clf8.fit(X_train_tfidf.toarray(), y_train)
        pred = clf8.predict(tfidfvect.transform(X_test).toarray())
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))

        start = time()
        clf9 = Perceptron(n_jobs=-1)

        clf9.fit(X_train_tfidf.toarray(), y_train)
        pred = clf9.predict(tfidfvect.transform(X_test).toarray())
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))
        start = time()
        clf10 = RidgeClassifierCV()

        clf10.fit(X_train_tfidf.toarray(), y_train)
        pred = clf10.predict(tfidfvect.transform(X_test).toarray())
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)

        clf11.fit(X_train_tfidf.toarray(), y_train)
        pred = clf11.predict(tfidfvect.transform(X_test).toarray())
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))
        start = time()
        # NOTE: clf12 is labeled "XGBC" below but is a second SGDClassifier;
        # an XGBoost classifier was presumably intended here.
        clf12 = SGDClassifier(n_jobs=-1)

        clf12.fit(X_train_tfidf.toarray(), y_train)
        pred = clf12.predict(tfidfvect.transform(X_test).toarray())
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a5:
            pickle.dump(clf5, open(filename + '_model', 'wb'))
        elif max_list == a6:
            pickle.dump(clf6, open(filename + '_model', 'wb'))
        elif max_list == a7:
            pickle.dump(clf7, open(filename + '_model', 'wb'))
        elif max_list == a8:
            pickle.dump(clf8, open(filename + '_model', 'wb'))
        elif max_list == a9:
            pickle.dump(clf9, open(filename + '_model', 'wb'))
        elif max_list == a10:
            pickle.dump(clf10, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html", ac1=a1, ac2=a2, ac3=a3, ac4=a4, ac5=a5, ac6=a6, ac7=a7, ac8=a8, ac9=a9,
                               ac10=a10, ac11=a11, ac12=a12)
X_train = np.empty(shape=(len(Li), len(feature_dict) + 3))
Y_train = np.empty(shape=len(Li))

for i in range(len(Li)):
    #print(Li[i])
    List = tokenize(Li[i][0])
    #    score_snippet(List,dal)
    X_train[i] = get_features(List)
    Y_train[i] = Li[i][1]
# for i in range(len(Li)):
#     #print(Li[i])
#     print(X_train[i],":",Y_train[i])

normalized_X = (normalize(X_train))
clf = GaussianNB()
clf.fit(normalized_X, Y_train)
clf_lr = LogisticRegression()
clf_lr.fit(normalized_X, Y_train)

Li_test = load_corpus("/Users/sravyakurra/Desktop/NLP/HW@2/test.txt")
X_test = np.empty(shape=(len(Li_test), len(feature_dict) + 3))
Y_test = np.empty(shape=len(Li_test))

for i in range(len(Li_test)):
    #print(Li[i])
    List = tokenize(Li_test[i][0])

    X_test[i] = get_features(List)
    Y_test[i] = Li_test[i][1]
#print(X_test)
#print("hiii")
class Model_Finder:
    """
                This class shall  be used to find the model with best accuracy and AUC score.
                """
    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object
        self.gnb = GaussianNB()
        self.xgb = XGBClassifier(objective='binary:logistic', n_jobs=-1)

    def get_best_params_for_naive_bayes(self, train_x, train_y):
        """
        Method Name: get_best_params_for_naive_bayes
        Description: get the parameters for the Naive Bayes's Algorithm which give the best accuracy.
                     Use Hyper Parameter Tuning.
        Output: The model with the best parameters
        On Failure: Raise Exception
                        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_params_for_naive_bayes method of the Model_Finder class'
        )
        try:
            # initializing with different combination of parameters
            self.param_grid = {
                "var_smoothing": [
                    1e-9, 0.1, 0.001, 0.5, 0.05, 0.01, 1e-8, 1e-7, 1e-6, 1e-10,
                    1e-11
                ]
            }

            #Creating an object of the Grid Search class
            self.grid = GridSearchCV(estimator=self.gnb,
                                     param_grid=self.param_grid,
                                     cv=3,
                                     verbose=3)
            #finding the best parameters
            self.grid.fit(train_x, train_y)

            #extracting the best parameters
            self.var_smoothing = self.grid.best_params_['var_smoothing']

            #creating a new model with the best parameters
            self.gnb = GaussianNB(var_smoothing=self.var_smoothing)
            # training the new model
            self.gnb.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object,
                'Naive Bayes best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_naive_bayes method of the Model_Finder class'
            )

            return self.gnb
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in get_best_params_for_naive_bayes method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Naive Bayes parameter tuning failed. Exited the get_best_params_for_naive_bayes method of the Model_Finder class'
            )
            raise Exception()

    def get_best_params_for_xgboost(self, train_x, train_y):
        """
                                        Method Name: get_best_params_for_xgboost
                                        Description: get the parameters for XGBoost Algorithm which give the best accuracy.
                                                     Use Hyper Parameter Tuning.
                                        Output: The model with the best parameters
                                        On Failure: Raise Exception
                                """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_params_for_xgboost method of the Model_Finder class'
        )
        try:
            # initializing with different combination of parameters
            self.param_grid_xgboost = {
                "n_estimators": [50, 100, 130],
                "max_depth": range(3, 11, 1),
                "random_state": [0, 50, 100]
            }
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(
                XGBClassifier(objective='binary:logistic'),
                self.param_grid_xgboost,
                verbose=3,
                cv=2,
                n_jobs=-1)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.random_state = self.grid.best_params_['random_state']
            self.max_depth = self.grid.best_params_['max_depth']
            self.n_estimators = self.grid.best_params_['n_estimators']

            # creating a new model with the best parameters
            self.xgb = XGBClassifier(random_state=self.random_state,
                                     max_depth=self.max_depth,
                                     n_estimators=self.n_estimators,
                                     n_jobs=-1)
            # training the new model
            self.xgb.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object,
                'XGBoost best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_xgboost method of the Model_Finder class'
            )
            return self.xgb
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in get_best_params_for_xgboost method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'XGBoost parameter tuning failed. Exited the get_best_params_for_xgboost method of the Model_Finder class'
            )
            raise Exception()

    def get_best_model(self, train_x, train_y, test_x, test_y):
        """
                                                Method Name: get_best_model
                                                Description: Find out the Model which has the best AUC score.
                                                Output: The best model name and the model object
                                                On Failure: Raise Exception
                                        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_model method of the Model_Finder class')
        # create best model for XGBoost
        try:
            self.xgboost = self.get_best_params_for_xgboost(train_x, train_y)
            self.prediction_xgboost = self.xgboost.predict(
                test_x)  # Predictions using the XGBoost Model

            # roc_auc_score raises an error when y contains a single label,
            # so fall back to accuracy in that case
            if len(test_y.unique()) == 1:
                self.xgboost_score = accuracy_score(test_y,
                                                    self.prediction_xgboost)
                self.logger_object.log(
                    self.file_object,
                    'Accuracy for XGBoost:' + str(self.xgboost_score))
            else:
                self.xgboost_score = roc_auc_score(
                    test_y, self.prediction_xgboost)  # AUC for XGBoost
                self.logger_object.log(
                    self.file_object,
                    'AUC for XGBoost:' + str(self.xgboost_score))

            # create best model for Naive Bayes
            self.naive_bayes = self.get_best_params_for_naive_bayes(
                train_x, train_y)
            self.prediction_naive_bayes = self.naive_bayes.predict(
                test_x)  # predictions using the Naive Bayes model

            # same fallback: accuracy when y contains only one label
            if len(test_y.unique()) == 1:
                self.naive_bayes_score = accuracy_score(
                    test_y, self.prediction_naive_bayes)
                self.logger_object.log(
                    self.file_object,
                    'Accuracy for NB:' + str(self.naive_bayes_score))
            else:
                self.naive_bayes_score = roc_auc_score(
                    test_y,
                    self.prediction_naive_bayes)  # AUC for Naive Bayes
                self.logger_object.log(
                    self.file_object,
                    'AUC for NB:' + str(self.naive_bayes_score))

            #comparing the two models
            if (self.naive_bayes_score < self.xgboost_score):
                return 'XGBoost', self.xgboost
            else:
                return 'NaiveBayes', self.naive_bayes

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in get_best_model method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Model selection failed. Exited the get_best_model method of the Model_Finder class'
            )
            raise Exception()
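# A usage sketch for Model_Finder. The stub logger below is an assumption;
# the surrounding project supplies its own file_object/logger_object, and
# train_x/train_y/test_x/test_y come from its preprocessing pipeline.
class _StdoutLogger:
    def log(self, file_object, message):
        # print instead of writing to the project's log file
        print(message)

finder = Model_Finder(file_object=None, logger_object=_StdoutLogger())
# best_name, best_model = finder.get_best_model(train_x, train_y, test_x, test_y)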
Example #52
class NaiveBayes:
    def __init__(self,
                 features={},
                 split=0.8,
                 distribution="Bernoulli",
                 isSummary=False):
        self.Tags = [
            "OTH", "BKG", "CTR", "NA", "AIM", "OWN", "BAS", "TXT", "", "BEGIN"
        ]
        self.Locations = [
            "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
            "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"
        ]
        self.ParaLocations = ["INITIAL", "MEDIAL", "FINAL"]
        self.Headlines = [
            "Introduction", "Implementation", "Example", "Conclusion",
            "Result", "Evaluation", "Solution", "Discussion", "Further Work",
            "Data", "Related Work", "Experiment", "Problems", "Method",
            "Problem Statement", "Non-Prototypical"
        ]
        self.YESorNO = ["YES", "NO"]
        self.SecLocations = [
            "FIRST", "SECOND", "THIRD", "LAST", "SECOND-LAST", "THIRD-LAST",
            "SOMEWHERE"
        ]
        self.Tenses = ["PRESENT", "PAST", "FUTURE", "NOVERB"]
        self.Modals = ["MODAL", "NOMODAL", "NOVERB"]
        self.Voices = ["Active", "Passive", "NOVERB"]
        self.isSummary = isSummary
        self.features = features
        self.transformFeatures()
        self.distribution = distribution
        self.split = split
        self.splitData()

    def reloadDis(self):
        if self.distribution == "Bernoulli":
            self.nb = BernoulliNB()
        elif self.distribution == "Multinomial":
            self.nb = MultinomialNB()
        elif self.distribution == "Complement":
            self.nb = ComplementNB()
        else:
            self.nb = GaussianNB()

    def splitData(self):
        if not self.isSummary:
            print("Data split between train and test: " + str(self.split))
        # list() so the permuted integer indices can subscript the keys (Python 3)
        papers = list(self.features.keys())
        order = np.random.permutation(len(papers))

        self.train_papers = []
        for i in range(int(self.split * len(papers))):
            self.train_papers.append(papers[order[i]])

        self.test_papers = []
        # start at the split point (the original's +1 silently dropped one paper)
        for i in range(int(self.split * len(papers)), len(papers)):
            self.test_papers.append(papers[order[i]])

        self.train_X, self.train_y = self.getFeatures(self.train_papers)
        self.test_X, self.test_y = self.getFeatures(self.test_papers)

    def transformFeatures(self):
        self.transformed_features = dict()
        for filename in self.features.keys():
            self.transformed_features[filename] = dict()
            for sentId in self.features[filename].keys():
                self.transformed_features[filename][sentId] = dict()
                self.transformed_features[filename][sentId][
                    'loc'] = self.Locations.index(
                        self.features[filename][sentId]['loc'])
                self.transformed_features[filename][sentId][
                    'parloc'] = self.ParaLocations.index(
                        self.features[filename][sentId]['parloc'])
                self.transformed_features[filename][sentId][
                    'val'] = self.Tags.index(
                        self.features[filename][sentId]['val'])
                self.transformed_features[filename][sentId][
                    'Title'] = self.YESorNO.index(
                        self.features[filename][sentId]['Title'])
                self.transformed_features[filename][sentId][
                    'len'] = self.YESorNO.index(
                        self.features[filename][sentId]['len'])
                self.transformed_features[filename][sentId][
                    'tfidf'] = self.YESorNO.index(
                        self.features[filename][sentId]['tfidf'])
                self.transformed_features[filename][sentId][
                    'secloc'] = self.SecLocations.index(
                        self.features[filename][sentId]['secloc'])
                self.transformed_features[filename][sentId][
                    'Headlines'] = self.Headlines.index(
                        self.features[filename][sentId]['Headlines'])
                self.transformed_features[filename][sentId][
                    'history'] = self.Tags.index(
                        self.features[filename][sentId]['history'])
                self.transformed_features[filename][sentId][
                    'tense'] = self.Tenses.index(
                        self.features[filename][sentId]['tense'])
                self.transformed_features[filename][sentId][
                    'voice'] = self.Voices.index(
                        self.features[filename][sentId]['voice'])
                self.transformed_features[filename][sentId][
                    'modal'] = self.Modals.index(
                        self.features[filename][sentId]['modal'])

    def getFeatures(self, filenames):
        X = []
        y = []
        for filename in filenames:
            for sentId in self.transformed_features[filename].keys():
                # note: the feature dict's values still include 'val' (the label itself)
                X.append(list(self.transformed_features[filename][sentId].values()))
                y.append(self.transformed_features[filename][sentId]['val'])
        X = np.asarray(X)
        y = np.asarray(y)
        return X, y

    def getSummary(self, filename):
        summary = []
        for sentId in self.transformed_features[filename].keys():
            feature = list(self.transformed_features[filename][sentId].values())
            y = self.nb.predict([feature])[0]
            if y in [1, 2, 4, 6]:
                summary.append(self.features[filename][sentId]['data'])
            if y in [0, 1, 5]:
                if random.uniform(0, 1) > 0.96:
                    summary.append(self.features[filename][sentId]['data'])
        return "\n".join(summary)

    def train(self):
        if not self.isSummary:
            print("Train dataset: ", len(self.train_papers))
        self.reloadDis()
        y_pred = self.nb.fit(self.train_X, self.train_y).predict(self.train_X)
        if not self.isSummary:
            print("Mislabelled sentences: " +
                  str((self.train_y != y_pred).sum()) +
                  " out of " + str(self.train_X.shape[0]))
            print("Train Accuracy: " +
                  str(self.accuracy((self.train_y != y_pred).sum(),
                                    self.train_X.shape[0])))

    def test(self, generate_histogram=False):
        print("Test dataset length: ", len(self.test_papers))
        y_pred = self.nb.predict(self.test_X)
        if generate_histogram:
            plt.hist(y_pred, density=True)
            plt.savefig('histogram.png')
        print("Mislabelled sentences: " +
              str((self.test_y != y_pred).sum()) +
              " out of " + str(self.test_X.shape[0]))
        print("Test Accuracy: " +
              str(self.accuracy((self.test_y != y_pred).sum(),
                                self.test_X.shape[0])))
        # return self.getConfusionMatrix(self.test_y, y_pred)

    def accuracy(self, misclassifications, samples):
        return (1 - (misclassifications / (samples * 1.0))) * 100.0

    def plotConfusionMatrix(self,
                            cm,
                            classes,
                            normalize=True,
                            title='Confusion matrix',
                            cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        print(cm)

        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j,
                     i,
                     format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig('confusion_matrix.png')

    def getConfusionMatrix(self, y_true, y_pred):
        return confusion_matrix(y_true, y_pred)
Example #53
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('diabetes.csv')

x = df.drop('diabetes', axis=1)
y = df['diabetes']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

model = GaussianNB()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_pred

accuracy = accuracy_score(y_test, y_pred)*100
accuracy

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

print(x_train)

print(x_test)

print(y_train)
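# The seaborn/matplotlib imports above are otherwise unused; a sketch (an
# addition, not part of the original example) plotting the confusion matrix
# for the re-split test set. Note the model was fit on the first split, so
# some rows of this x_test were seen during training.
from sklearn.metrics import confusion_matrix

y_pred2 = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred2)
sns.heatmap(cm, annot=True, fmt='d', cbar=False)
plt.xlabel('predicted')
plt.ylabel('true')
plt.show()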
Example #54
import numpy as np  # needed for np.concatenate below
import matplotlib.pyplot as plt
import pandas as pd
dt = pd.read_csv('Data.csv')
print(dt)
X = dt.iloc[:, 1:-1].values
y = dt.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()  # a classifier; the original's name `reg` wrongly suggested a regressor
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(
    np.concatenate(
        (y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
X = my_imputer.fit_transform(X)
X_test = my_imputer.fit_transform(X_test)

model1 = GaussianNB()
model1.fit(X, y)
model2 = RandomForestClassifier(max_depth=15,
                                n_estimators=100,
                                bootstrap=False,
                                max_features='sqrt',
                                min_samples_leaf=4,
                                min_samples_split=10)
model2.fit(X, y)
model3 = MLPClassifier(solver='lbfgs',
                       alpha=1e-5,
                       hidden_layer_sizes=(5, 2),
                       random_state=1,
                       max_iter=2000)
model3.fit(X, y)
model4 = KNeighborsClassifier(3)
model4.fit(X, y)
plt.show()
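# None of the four fitted models above is evaluated in this fragment; a
# hedged sketch comparing them with 5-fold cross-validation on the imputed
# matrix (X and y as defined above):
from sklearn.model_selection import cross_val_score

for name, m in [('GaussianNB', model1), ('RandomForest', model2),
                ('MLP', model3), ('kNN', model4)]:
    scores = cross_val_score(m, X, y, cv=5)
    print(name, 'CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))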
"""

#Split the data (X/Y: train features/targets, x/y: test features/targets)
X, x, Y, y = train_test_split(features,
                              targets,
                              test_size=0.2,
                              random_state=10)

#Let us try different algorithms to find the best match
#1. Naive-Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
#Create a GaussianNB object
gnb = GaussianNB()
pred = gnb.fit(X, Y).predict(x)
print("Naive-Bayes accuracy: ", accuracy_score(y, pred, normalize=True))

#2. Linear Support Vector Classifier
from sklearn.svm import LinearSVC
svc_model = LinearSVC(random_state=0)
pred = svc_model.fit(X, Y).predict(x)
print("Linear SVC accuracy: ", accuracy_score(y, pred, normalize=True))

#3. k-Nearest-Neighbours classifier
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, Y)
pred = neigh.predict(x)
print("k-Nearest-Neighbours score: ", accuracy_score(y, pred))
# Read the small handwritten digit dataset
digitsData = pd.read_csv('digits_small.csv')
y_digits = digitsData['0']
X_digits = digitsData.drop('0', axis=1)

# Split the data to train and test dataset with 20%
Xtrain, Xtest, ytrain, ytest = train_test_split(X_digits,
                                                y_digits,
                                                random_state=0,
                                                test_size=0.2)

# Choose Gaussian Naive Bayes model
model = GaussianNB()

# Fit the model with the handwritten-digit training data
model.fit(Xtrain, ytrain)

# Evaluate the outcome for Xtest
y_fitted = model.predict(Xtest)

# Print the accuracy. It is 0.825.
print("The accuracy of GaussianNB is %f" % (accuracy_score(ytest, y_fitted)))
confusionMat = confusion_matrix(ytest, y_fitted)
sns.heatmap(confusionMat, cbar=False, square=True, annot=True)
plt.xlabel('predicted digits')
plt.ylabel('true digits')

# Evaluate five-fold cross validation scores
cv_score = cross_val_score(model, X_digits, y_digits, cv=5)
print("Cross Validation Scores:", cv_score)
class Emoji(object):

    def __init__(self):
        # seed the RNG and load the emoji lookup table
        np.random.seed(42)
        self.emojis = pd.read_pickle('../database/df_emojis.pkl')

    def fit(self):

        # ------- this part needs work
        try:
            self.labeled_tweets = pd.read_pickle('../database/labeled.pkl')
            print('it worked')
        except Exception:
            from label_tweets import label_tweets
            tweets = np.array(list(pickle.load(open('../database/yay_moji.pkl', 'rb'))))
            self.by_emoji, self.labeled_tweets = label_tweets(tweets, self.emojis, top=50, save=True)

        self.y = self.labeled_tweets['emoji'].values
        self.X = self.labeled_tweets['tweet'].values


    def model(self, max_df_=0.8, min_df_=0.001, ngram=(1, 2)):

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y)

        stopwords = set(list(ENGLISH_STOP_WORDS) +
                        ['rt', 'follow', 'dm', 'https', 'ur', 'll', 'amp',
                         'subscribe', 'don', 've', 'retweet', 'im', 'http', 'lt'])

        # fit the TF-IDF vectorizer
        self.tfidf = TfidfVectorizer(max_features=10000, max_df=max_df_,
                                     min_df=min_df_, stop_words=stopwords,
                                     ngram_range=ngram)

        self.tfidf.fit(self.X_train)
        self.vector = self.tfidf.transform(self.X_train)

        # --> add the emoji name to bag of words for each emoji
        self.bag = np.array(self.tfidf.get_feature_names())

        # GaussianNB needs a dense array, not a sparse matrix
        self.nb = GaussianNB()
        self.nb.fit(self.vector.toarray(), self.y_train)

    def internal_predict(self, print_side_by_side=True):
        test_tfidf = self.tfidf.transform(self.X_test)
        predicted = self.nb.predict(test_tfidf.toarray())
        acc = np.mean(self.y_test == predicted)

        print('Test accuracy =', acc)
        print('')

        if print_side_by_side:
            for true, predict in zip(self.y_test, predicted):
                print('-->', true, predict)


    def predict(self, text):
        test_tfidf = self.tfidf.transform([text])
        probs = self.nb.predict_proba(test_tfidf.toarray())
        probs = probs.flatten()
        above_0 = np.argwhere(probs > 0).flatten()
        # order the candidate classes by probability, highest first
        above_0 = above_0[np.argsort(probs[above_0])[::-1]]
        print('-->', text, '=', end=' ')
        for i in above_0[:5]:
            print(self.nb.classes_[i], end=' ')
        print('')

        return probs

    def print_top_words(self, top_n_words=5):
        # printing top words for each emoji
        print('')
        print('----- Top {} words for each Emoji in Train set'.format(top_n_words))
        print('-' * 60)
        for i in range(len(self.nb.classes_)):
            top = self.bag[self.nb.theta_[i].argsort()[::-1]][:top_n_words]
            print(self.nb.classes_[i], '-->', top)
        print('')
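# A usage sketch for the Emoji pipeline above; the pickle paths are the
# class's own, and the sample tweet is a placeholder.
if __name__ == '__main__':
    emo = Emoji()
    emo.fit()      # load (or build) the labeled tweets
    emo.model()    # fit the TF-IDF vectorizer and the GaussianNB model
    emo.internal_predict(print_side_by_side=False)
    emo.predict('i love this so much')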
def naive_bayes(training_file, test_file):

    start = time.time()
    #-----------------------------------DATA PREPARATION-----------------------------------
    training_set = pd.read_csv(training_file, header=None)
    test_set = pd.read_csv(test_file, header=None)

    #encoding the training set
    categorical_feature_mask = training_set.dtypes == object
    categorical_cols = training_set.columns[categorical_feature_mask].tolist()

    le = LabelEncoder()
    training_set[categorical_cols] = training_set[categorical_cols].apply(
        lambda col: le.fit_transform(col))

    #encoding the test set
    categorical_feature_mask = test_set.dtypes == object
    categorical_cols = test_set.columns[categorical_feature_mask].tolist()

    le = LabelEncoder()
    test_set[categorical_cols] = test_set[categorical_cols].apply(
        lambda col: le.fit_transform(col))

    label_col = len(training_set.columns) - 1

    x = training_set.drop([label_col], axis=1)
    y = training_set[label_col]

    x_test = test_set.drop([label_col], axis=1)
    y_test = test_set[label_col]

    #-----------------------------------MODEL GENERATION AND PREDICTION-----------------------------------
    # Gaussian Naive Bayes
    gb = GaussianNB()
    # Performing training
    gb.fit(x, y)

    # prediction of the test set class attribute
    pred = gb.predict(x_test)

    #-----------------------------------COMPARISON AND OUTPUT-----------------------------------
    #ytestar = y_test.to_numpy()

    true_values = []
    y_test_rows = y_test.shape[0]

    for i in range(y_test_rows):
        if y_test[i] == pred[i]:
            true_values.append(1)
        else:
            true_values.append(0)

    for i in range(0, y_test_rows):
        print("ID = " + str(i) + " predicted = " + str(pred[i]) + " true = " +
              str(y_test[i]) + " accuracy = " + str(true_values[i]))

    print("\nClassification report:\n" + classification_report(y_test, pred))

    print("Runtime: ")
    print(time.time() - start)
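# Usage sketch: the CSV names are placeholders; both files are expected to
# hold the class attribute in their last column.
if __name__ == '__main__':
    naive_bayes('train.csv', 'test.csv')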
train_y = np.mat(df_train.iloc[select_idx]['label'].tolist()).reshape((-1, 1))

## build the test-set matrix
test_x = makeDataMat(df_train.iloc[test_select_idx], vocabList)
test_y = np.mat(df_train.iloc[test_select_idx]['label'].tolist()).reshape(
    (-1, 1))

# In[12]:

train_x.shape, train_y.shape

# In[13]:

# train the model
model = GaussianNB()
# flatten the column matrix: sklearn expects a 1-D label array
model.fit(train_x, np.asarray(train_y).ravel())

# In[14]:

# class priors for the two classes
model.class_prior_

# In[15]:

sum(train_y)

# In[16]:

# sample text: "This is really so cute! I really like it here"
model.predict(
    np.array(word2Vect('这个真的好可爱啊!我超喜欢这里的', vocabList)).reshape(1, -1))