def decisionTree(training_list, testing_list, fileTestBelongCategory, words_name, use_version2=True):
    print "-----------------------------------------------------------------------------------------"
    print "\nDecision Tree Algorithm\n"
    if use_version2:
        # adjust_depth_dict = {}
        # for max_depth in range(10, 121, 10):
        #     DT.decisionTree_version2(training_list, testing_list, max_depth=max_depth, adjust_depth_dict=adjust_depth_dict)
        # save_obj(adjust_depth_dict, 'adjust_depth')
        DT.decisionTree_version2(training_list, testing_list, fileTestBelongCategory)
    else:
        DT.decisionTree_version1(training_list, testing_list, words_name,
                                 num_trainning_file=200, num_features=1000)
        # num_trainning_file=len(training_list), num_features=len(training_list[0]) - 1
def test_gini():
    array = [1, 1, 2, 1, 2]
    result = DT()._gini(np.array(array))
    actual = 0.48
    message = 'Gini value for {}: Got {:.2f}. Should be {:.2f}'.format(
        array, result, actual)
    n.assert_almost_equal(result, actual, 4, message)
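The expected value 0.48 matches the textbook Gini impurity 1 - sum(p_i^2); for [1, 1, 2, 1, 2] that is 1 - 0.6^2 - 0.4^2 = 0.48. The `_gini` method itself does not appear in this section, so the following is only a minimal sketch of an implementation that would satisfy the test:

import numpy as np

def _gini(self, y):
    # Sketch (assumed, not the actual DT method): Gini impurity
    # 1 - sum(p_i^2) over the class frequencies in y.
    _, counts = np.unique(y, return_counts=True)
    p = counts / float(len(y))
    return 1.0 - np.sum(p ** 2)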
def test_information_gain():
    X, y, X1, y1, X2, y2 = fake_data()
    result = DT()._information_gain(y, y1, y2)
    actual = 0.019973
    message = 'Information gain for:\n{}, {}, {}:\nGot {:.3f}. Should be {:.3f}'.format(
        y, y1, y2, result, actual)
    n.assert_almost_equal(result, actual, 4, message)
def test_entropy():
    array = [1, 1, 2, 1, 2]
    result = DT()._entropy(np.array(array))
    actual = 0.97095
    message = 'Entropy value for {}: Got {:.2f}. Should be {:.2f}'.format(
        array, result, actual)
    n.assert_almost_equal(result, actual, 4, message)
def test_entropy():
    array = [1, 1, 2, 1, 2]
    result = DT()._entropy(np.array(array))
    actual = 0.67301
    message = 'Entropy value for %r: Got %.2f. Should be %.2f' \
        % (array, result, actual)
    n.assert_almost_equal(result, actual, 4, message)
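Note that the two test_entropy variants expect different values for the same array: 0.97095 is the base-2 (Shannon) entropy of [1, 1, 2, 1, 2], while 0.67301 is the natural-log entropy, so the two snippets evidently target implementations that use different logarithm bases. A sketch covering both (assumed, since `_entropy` is not shown here):

import numpy as np

def _entropy(self, y, base=2):
    # Sketch (assumed): entropy = -sum(p_i * log(p_i)).
    # base=2 yields 0.97095 for [1, 1, 2, 1, 2]; base=np.e yields 0.67301.
    _, counts = np.unique(y, return_counts=True)
    p = counts / float(len(y))
    return -np.sum(p * np.log(p)) / np.log(base)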
def test_information_gain():
    X, y, X1, y1, X2, y2 = fake_data()
    result = DT()._information_gain(y, y1, y2)
    actual = 0.01384
    message = 'Information gain for:\n%r, %r, %r:\nGot %.3f. Should be %.3f' \
        % (y, y1, y2, result, actual)
    n.assert_almost_equal(result, actual, 4, message)
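Both information-gain tests share the same formula, parent impurity minus the size-weighted impurity of the two children; the differing expected constants (0.019973 vs 0.01384) suggest the two snippets plug different impurity measures (entropy vs Gini) into it. A sketch under that assumption, where self.impurity stands for whichever of _entropy or _gini the tree was configured with:

def _information_gain(self, y, y1, y2):
    # Sketch (assumed): gain = impurity(parent)
    #                        - size-weighted average impurity of the children.
    n = float(len(y))
    return self.impurity(y) - (len(y1) / n * self.impurity(y1) +
                               len(y2) / n * self.impurity(y2))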
def test_gini():
    array = [1, 1, 2, 1, 2]
    result = DT()._gini(np.array(array))
    actual = 0.48
    message = 'Gini value for %r: Got %.2f. Should be %.2f' \
        % (array, result, actual)
    n.assert_almost_equal(result, actual, 4, message)
def create_CategoryAUC(categoryList):
    # {'acq': ['1', '2'], 'cad': ['3', '4'], ...}
    categoryAssigFileTFIDF = {}
    for cat in categoryList:
        categoryAssigFileTFIDF[cat] = DT.getTopCategory(
            cat, len(fileTestAlphaNumericStrStemmedDict.keys()))
    return categoryAssigFileTFIDF
def test_make_split():
    X, y, X1, y1, X2, y2 = fake_data()
    split_index, split_value = 1, 'bat'
    dt = DT()
    dt.categorical = np.array([False, True])
    result = dt._make_split(X, y, split_index, split_value)
    try:
        X1_result, y1_result, X2_result, y2_result = result
    except ValueError:
        n.assert_true(False, 'result not in correct form: (X1, y1, X2, y2)')
    actual = (X1, y1, X2, y2)
    message = '_make_split got results\n%r\nShould be\n%r' % (result, actual)
    n.ok_(np.array_equal(X1, X1_result), message)
    n.ok_(np.array_equal(y1, y1_result), message)
    n.ok_(np.array_equal(X2, X2_result), message)
    n.ok_(np.array_equal(y2, y2_result), message)
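`_make_split` itself is not shown; from the test, it partitions (X, y) on one column, with dt.categorical flagging which columns get an equality split rather than a numeric threshold. A plausible sketch (assumed: '==' for categorical columns, '<' for numeric ones):

import numpy as np

def _make_split(self, X, y, split_index, split_value):
    # Sketch (assumed): boolean mask over one column; equality test for
    # categorical features, '<' threshold for numeric ones.
    col = X[:, split_index]
    if self.categorical[split_index]:
        mask = col == split_value
    else:
        mask = col < split_value
    return X[mask], y[mask], X[~mask], y[~mask]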
def test_choose_split_index():
    X, y, X1, y1, X2, y2 = fake_data()
    index, value = 1, 'cat'
    dt = DT()
    dt.categorical = np.array([False, True])
    result = dt._choose_split_index(X, y)
    try:
        split_index, split_value, splits = result
    except ValueError:
        message = 'result not in correct form. Should be:\n' \
                  '    split_index, split_value, splits'
        n.assert_true(False, message)
    message = 'choose split for data:\n%r\n%r\n' \
              'split index, split value should be: %r, %r\n' \
              'not: %r, %r' \
              % (X, y, index, value, split_index, split_value)
    n.eq_(split_index, index, message)
    n.eq_(split_value, value, message)
def test_choose_split_index():
    X, y, X1, y1, X2, y2 = fake_data()
    index, value = 1, 'cat'
    dt = DT()
    dt.categorical = np.array([False, True])
    result = dt._choose_split_index(X, y)
    try:
        split_index, split_value, splits = result
    except ValueError:
        message = ('result not in correct form. Should be:\n'
                   '    split_index, split_value, splits')
        n.assert_true(False, message)
    message = (
        'choose split for data:\n{}\n{}\n'.format(X, y) +
        'split index, split value should be: {}, {}\n'.format(index, value) +
        'not: {}, {}'.format(split_index, split_value))
    n.eq_(split_index, index, message)
    n.eq_(split_value, value, message)
def fit(self, X, y, maxIter=3):
    y = y.reshape((-1, 1))
    ffit = np.zeros(y.shape)
    for i in range(maxIter):
        r = y - ffit  # residuals
        # Concatenate X with the residuals and feed them to the decision tree algorithm
        alg = DT()
        alg.BuildTree(np.concatenate((X, r), axis=1))
        # BuildTree applies a threshold, so the algorithm may not run at all
        if alg.T is None:
            break
        self.ModelArr.append(alg)
        # Update ffit
        ffit = ffit + alg.predict(X).reshape((-1, 1))
    return self.ModelArr
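fit returns self.ModelArr, the list of trees whose outputs were accumulated into ffit, so predicting on new data is just the sum of every stored tree's prediction. A matching predict sketch (hypothetical, mirroring the ffit update inside the loop above; assumes numpy is imported as np):

def predict(self, X):
    # Sketch (assumed): boosted prediction = sum of all trees' outputs,
    # exactly how ffit is built up during fit.
    ffit = np.zeros((X.shape[0], 1))
    for alg in self.ModelArr:
        ffit = ffit + alg.predict(X).reshape((-1, 1))
    return ffit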
def play_turn(self, board):
    """Execute all of the actions on the board for this turn, then end
    the turn and give the game back to the human."""
    while True:
        loop.refresh(board)
        time.sleep(0.3)
        # Check if we have lethal.
        # lethalActions = check_for_lethal(board)
        # if lethalActions is not None:
        #     lethalActions[0].perform(board)
        #     continue
        tree = DT(board, strategy=self.strategy)
        bestAction = tree.bestAction
        del tree
        if bestAction is None:
            # No more available actions.
            break
        if isinstance(bestAction, Action.DoNothingAction):
            bestAction.perform(board)
            break
        bestAction.perform(board)
    loop.end_turn(board)
from DecisionTree import DT
from sklearn import tree
import numpy as np

if __name__ == '__main__':
    DT_dataSet, feature_labels = DT.createDataSet()
    samples = [example[:-1] for example in DT_dataSet]
    class_labels = [example[-1] for example in DT_dataSet]
    classify = tree.DecisionTreeClassifier()
    classify.fit(samples, class_labels)
    tv = [1, 0]
    r = classify.predict([tv])
    # r = classify.predict(np.array(tv).reshape(1, -1))
    print(r)
def naiveBayes(list):
    print "\nNaive Bayes Algorithm\n"

# Define Naive Bayes algorithm in detail
def naiveBayesDetail(list):
    print "\nNaive Bayes Algorithm\n"

# Execute TF-IDF based Cosine Similarity algorithm
# tfidfCosineSimilarity(termFrequencyPerCategoryList)

# Execute Decision Tree algorithm
# decisionTree(frequencyInFilePerCategoryInTrainingSetList, frequencyInFilePerCategoryInTestSetList, fileTestBelongCategory, wholeVocabularyFromTrainingAndTestSetList)

# clf = DT.create_decision_tree(frequencyInFilePerCategoryInTrainingSetList, max_depth=80)
clf = DT.create_decision_tree(frequencyInFilePerCategoryInTrainingSetList)
top_k_categories = DT.get_top_k_prediction_class(clf, frequencyInFilePerCategoryInTestSetList[0], k=1)
# print top_k_categories
# print len(frequencyInFilePerCategoryInTestSetList)

# Confusion matrix - row: true test category, column: predicted category
realCategorySize = len(categoryAlphaNumericStrStemmedDict.keys())
confusionMatrix = np.zeros((realCategorySize, realCategorySize), dtype=np.int)
categoryTestToIndexDict = {}
idx = 0
for key in categoryAlphaNumericStrStemmedDict.keys():
    categoryTestToIndexDict[key] = idx
    idx += 1

confusionTable = np.zeros((2, 2), dtype=np.int)
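The snippet above allocates confusionMatrix and the category-to-index map but never fills them. A hypothetical fill step is sketched below; it assumes fileTestBelongCategory maps each test file's position to its true category and that predictions index test files the same way, neither of which is confirmed by this section:

# Sketch (assumed names and shapes): count (true, predicted) category pairs.
for i, test_file in enumerate(frequencyInFilePerCategoryInTestSetList):
    predicted = DT.get_top_k_prediction_class(clf, test_file, k=1)[0]
    true_cat = fileTestBelongCategory[i]  # assumed: index -> true category
    confusionMatrix[categoryTestToIndexDict[true_cat],
                    categoryTestToIndexDict[predicted]] += 1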
print "\nNaive Bayes Algorithm\n" # Define Naive Bayes algorithm in detail def naiveBayesDetail(list): print "\nNaive Bayes Algorithm\n" # Execute TF-IDF based Cosine Similarity algorithm # tfidfCosineSimilarity(termFrequencyPerCategoryList) # Execute Decision Tree algorithm # decisionTree(frequencyInFilePerCategoryInTrainingSetList, frequencyInFilePerCategoryInTestSetList, fileTestBelongCategory, wholeVocabularyFromTrainingAndTestSetList) # clf = DT.create_decision_tree(frequencyInFilePerCategoryInTrainingSetList, max_depth=80) clf = DT.create_decision_tree(frequencyInFilePerCategoryInTrainingSetList) top_k_categories = DT.get_top_k_prediction_class( clf, frequencyInFilePerCategoryInTestSetList[0], k=1) # print top_k_categories # print len(frequencyInFilePerCategoryInTestSetList) # Confusion Matrix - row : true test category, column - true column category realCategorySize = len(categoryAlphaNumericStrStemmedDict.keys()) confusionMatrix = np.zeros((realCategorySize, realCategorySize), dtype=np.int) categoryTestToIndexDict = {} idx = 0 for key in categoryAlphaNumericStrStemmedDict.keys(): categoryTestToIndexDict[key] = idx idx += 1
def main():
    # SVM:
    t = train()
    # im = image.ProcessImages('train')
    # X, Y = im.make_all()
    # t.train(X, Y)

    # rbf:
    print('SVM clf:')
    t.load(name='clf_rbf.pkl')
    print(t.predict(measure.process('test3.png')))
    print(t.test())

    # linear:
    print('SVM linear:')
    t.load(name='clf_linear.pkl')
    print(t.predict(measure.process('test3.png')))
    print(t.test())

    # k nearest neighbours:
    # on all features
    # getBestKValue(400)
    print('all features, k neighbours:')
    im = image.ProcessImages('train')
    X, Y = im.make_all()
    # result on the training set:
    # maximum: 0.7032590051457976 neighbours: 183
    t2 = train_neigh(183)
    t2.train(X, Y)
    # result on the test set:
    # achieved score: 0.5483870967741935
    print('achieved score: ', t2.test())

    # decision tree:
    print('decision tree: ')
    tree = DT()
    tree.train(X, Y)
    print('achieved score: ', tree.test())
    print(tree.valid())
    tree.show(X, Y)

    # ecc-extent:
    print('two features: ecc, extent:')
    # getBestKValue(100, two_feature=True, features=['ecc', 'extent'])
    t3 = train_neigh(80, first_two=True, features=['ecc', 'extent'])
    t3.train(X, Y)
    print('achieved score: ', t3.test())
    # achieved score: 0.4946236559139785
    t3.train_first_two_features(X, Y)
    # _train_neigh(80, feauters=['ecc', 'extent'])

    # x0-y0:
    print('two features: x0, y0:')
    # getBestKValue(100, two_feature=True, features=['x0', 'y0'])
    t4 = train_neigh(54, first_two=True, features=['x0', 'y0'])
    t4.train(X, Y)
    print('achieved score: ', t4.test())
    # achieved score: 0.3333333333333333
    t4.train_first_two_features(X, Y)
    _train_neigh(54, feauters=['x0', 'y0'])

    # train_neigh_()
    im2 = image.ProcessImages('test')
    _train_neigh(80, feauters=['ecc', 'extent'])