# Generate evaluation metrics. Each split is predicted exactly once and the
# predictions are reused by the accuracy, confusion-matrix and report calls.
# StringIO comes from the standard library: the `sklearn.externals.six` shim
# used previously is not shipped by current scikit-learn releases.
from io import StringIO

import pydot

y_train_pred = clf.predict(X_train)
print("Train - Accuracy :", metrics.accuracy_score(y_train, y_train_pred))
print("Train - Confusion matrix :",
      metrics.confusion_matrix(y_train, y_train_pred))
print("Train - classification report :",
      metrics.classification_report(y_train, y_train_pred))

y_test_pred = clf.predict(X_test)
print("Test - Accuracy :", metrics.accuracy_score(y_test, y_test_pred))
print("Test - Confusion matrix :",
      metrics.confusion_matrix(y_test, y_test_pred))
print("Test - classification report :",
      metrics.classification_report(y_test, y_test_pred))

# Plain DOT dump of the fitted tree.
tree.export_graphviz(clf, out_file='tree.dot')

# Styled DOT source kept in memory so it can be rendered straight to PDF.
out_data = StringIO()
tree.export_graphviz(
    clf,
    out_file=out_data,
    feature_names=iris.feature_names,
    class_names=clf.classes_.astype(int).astype(str),
    filled=True,
    rounded=True,
    special_characters=True,
    node_ids=True,
)
# graph_from_dot_data yields a sequence of graphs here; the first is the tree.
graph = pydot.graph_from_dot_data(out_data.getvalue())
graph[0].write_pdf("iris.pdf")  # save to PDF
Esempio n. 2
0
    def buildTree(self):
        """Run the training / evaluation procedure ``self.num_run`` times.

        Every run calls ``self.initializeBuffer()``, fits ``self.current_tree``
        on ``self.X`` / ``self.y`` and then loops: collect ``self.update``
        environment steps, append the resulting Q-value targets to
        ``self.X`` / ``self.y`` (and to ``self.buffer``), and refit a fresh
        ``DecisionTreeRegressor``. Once the sliding reward window is full and
        its mean exceeds ``TEST_REWARD``, ``epsilon`` is forced to 0 and
        ``lastRun`` is incremented on each further tree until it reaches
        ``self.num_trees_optimal_policy``. Afterwards a tree is fitted on
        ``self.X_final`` / ``self.y_final``, tested with ``self.test_reward()``,
        optionally rendered to ``final_tree.png`` and scored with
        ``self.getMonteCarloError``.
        """

        k = 0

        # Thresholds used so far for the test reward:
        # 1 - Without considering FER: TEST_REWARD = 76.0
        # 2 - Without considering SER: TEST_REWARD = 76.0
        # 4 - Without considering ES: TEST_REWARD = 76.0
        TEST_REWARD = 76.0

        # Test reward of the all-features tree, one entry per run
        avgRewAllFeatures = []
        # Monte Carlo error of the all-features tree, one entry per run
        total_error = np.array([])

        # confidenceIntervalFeatures maps each feature name to the Monte Carlo errors obtained when that feature is left out (only filled by the disabled code further down)
        confidenceIntervalFeatures = dict()
        for elem in self.features:
            confidenceIntervalFeatures[elem] = np.array([])

        # Average reward per feature for trees built without that feature (only filled by the disabled code further down)
        averageRewardFeatures = dict()
        for elem in self.features:
            averageRewardFeatures[elem] = np.array([])

        # NOTE(review): this is a plain list, so the `+=` below extends it with
        # the importances instead of adding them element-wise (assuming
        # feature_importances_ is an array) - confirm the intended behaviour.
        total_importance = [
            0.0, 0.0, 0.0
        ]  # meant to accumulate the importances over all runs of the algorithm

        while (k < self.num_run):
            print("RUN " + str(k))

            # first tree based on the buffer produced by initializeBuffer()
            self.initializeBuffer()

            # Fit regression model
            self.current_tree.fit(self.X, self.y)

            # i counts the steps collected for the current tree, j counts the trees built so far and drives epsilon
            # flag marks that an episode already finished while collecting steps for the current tree
            # lastRun counts the trees built with the optimal policy (epsilon = 0) after the stopping criterion was reached
            i = j = flag = lastRun = 0

            scores = []
            slidingWindowReward = []

            while (lastRun < self.num_trees_optimal_policy):

                print("Tree number: " + str(j))
                total_reward = 0
                epsilon = self.get_epsilon(j)
                current_state = self.env.reset()

                # Stopping criterion: a full window whose mean beats TEST_REWARD
                if (len(slidingWindowReward) == self.slidingWindow):
                    if (sum(slidingWindowReward) / len(slidingWindowReward) >
                            TEST_REWARD):
                        total_importance += self.current_tree.feature_importances_
                        print("Total importance: " + str(total_importance))
                        lastRun += 1
                        epsilon = 0  # in order to consider the optimal policy we set epsilon = 0
                    else:
                        slidingWindowReward = [
                        ]  # we clear the slidingWindow if the samples considered don't match our threshold (TEST_REWARD)

                while (i < self.update):

                    # Project the state onto the selected features.
                    # NOTE(review): list.remove() deletes the first element EQUAL
                    # to the given value, which is not necessarily the element at
                    # that index when values repeat - confirm this is acceptable.
                    if ("Speech Emotion Recognition" not in self.features):
                        listx = list(current_state)
                        listx.remove(current_state[1])
                        tuplex = tuple(listx)
                        current_state = tuplex

                    if ("Object State" not in self.features):
                        listx = list(current_state)
                        listx.remove(current_state[2])
                        tuplex = tuple(listx)
                        current_state = tuplex

                    if (("Speech Emotion Recognition" in self.features)
                            and ("Object State" in self.features)):
                        current_state = current_state[self.initialIndex:self.
                                                      finalIndex]

                    action = self.choose_action(current_state, epsilon)
                    obs, reward, done, _ = self.env.step(action)
                    # keep the unprojected observation: it becomes the next state
                    temp = obs

                    # Same projection as above, applied to the new observation
                    if ("Speech Emotion Recognition" not in self.features):
                        listx = list(obs)
                        listx.remove(obs[1])
                        tuplex = tuple(listx)
                        obs = tuplex
                    if ("Object State" not in self.features):
                        listx = list(obs)
                        listx.remove(obs[2])
                        tuplex = tuple(listx)
                        obs = tuplex
                    if (("Speech Emotion Recognition" in self.features)
                            and ("Object State" in self.features)):
                        obs = obs[self.initialIndex:self.finalIndex]

                    total_reward += reward

                    # One-step target: overwrite the taken action's value with
                    # reward + gamma * max over the next state's predicted values
                    q_current = self.current_tree.predict([current_state])
                    q_new = self.current_tree.predict([obs])
                    q_current[0][action] = reward + self.gamma * np.max(
                        q_new[0])
                    self.buffer.append(
                        [current_state, action, obs, q_current[0]])
                    self.X.append(current_state)
                    self.y.append(q_current[0])

                    if (
                            not lastRun == 0
                    ):  # true once the stopping criterion has been met at least once, i.e. samples come from epsilon = 0
                        self.X_final.append(current_state)
                        self.y_final.append(q_current[0])
                    current_state = temp
                    i += 1
                    if done:
                        current_state = self.env.reset()
                        # only the first finished episode of this tree is scored
                        if (not flag):
                            scores.append(total_reward)
                            flag = 1
                            # NOTE(review): the window receives self.test_reward() here
                            # but total_reward in the no-episode-finished branch below
                            if (lastRun == 0):
                                slidingWindowReward.append(self.test_reward())
                i = 0
                # no episode finished within self.update steps
                if (not flag):
                    scores.append(total_reward)
                    if (lastRun == 0):
                        slidingWindowReward.append(total_reward)

                # Refit from scratch on everything collected so far
                self.current_tree = DecisionTreeRegressor()
                self.current_tree.fit(self.X, self.y)
                j += 1
                flag = 0

            # Test with all the variables, training only on the samples gathered while epsilon = 0
            # NOTE(review): X_final / y_final are not reset in this method, so they
            # presumably keep growing across runs - verify against initializeBuffer()
            print("Testing the tree considering all variables")
            self.current_tree = DecisionTreeRegressor()
            self.current_tree.fit(self.X_final, self.y_final)
            # Test reward of this run for the all-variables tree
            avgRewAllFeatures.append(self.test_reward())

            # Visualize data (the Image object is created but not used here)
            if (self.MAX_LEN > 2 and self.update > 2):
                dot_data = StringIO()
                export_graphviz(self.current_tree,
                                out_file=dot_data,
                                filled=True,
                                rounded=True,
                                special_characters=True,
                                feature_names=self.features)
                graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
                Image(graph.create_png())
                graph.write_png("final_tree.png")

            # The disabled code below builds one tree per feature, each without that feature, and generates a test set (some episodes) for each tree
            # From these episodes the Monte Carlo error (r + gamma * G_{t+1} - q^predicted(x,a))^{2} is computed and summed into a total error
            # The higher this error, the more important the removed feature
            """for i in range(0, len(self.features)):
                print("Testing the final tree without considering " + str(self.features[i]))
                self.current_tree = DecisionTreeRegressor()
                X_feature = self.dataFilter(i)
                self.current_tree.fit(X_feature, self.y_final)
                r, e = self.getMonteCarloError(i)

                averageRewardFeatures[self.features[i]] = np.append(averageRewardFeatures[self.features[i]], r)
                confidenceIntervalFeatures[self.features[i]] = np.append(confidenceIntervalFeatures[self.features[i]], e)"""

            # Refit on the final samples (same data as the fit above) and record
            # the Monte Carlo error returned for index len(self.features)
            self.current_tree = DecisionTreeRegressor()
            self.current_tree.fit(self.X_final, self.y_final)
            r, e = self.getMonteCarloError(len(self.features))
            total_error = np.append(total_error, e)

            k += 1
        """"# This is the final average reward after 25 run considering all features
Esempio n. 3
0
# Training features: every iris row except the held-out indices.
train_data = np.delete(iris["data"], test_index, axis=0)
print(train_data)

# Held-out samples used to sanity-check the classifier.
test_target = iris["target"][test_index]
test_data = iris["data"][test_index]

for shown in (test_target, test_data, iris.feature_names, iris.target_names):
    print(shown)

# fit() returns the estimator itself, so construction and fitting chain.
clf = tree.DecisionTreeClassifier().fit(train_data, train_target)

print(clf.predict(test_data))

# Visualisation: export the tree as DOT text in memory, then render a PNG.
dot_date = StringIO()
render_options = dict(
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
tree.export_graphviz(clf, out_file=dot_date, **render_options)
graph = pydotplus.graph_from_dot_data(dot_date.getvalue())
graph.write_png("iris.png")
Esempio n. 4
0
    RMSECvAll.append(math.sqrt( sum( (Originaly-PredictedYcv)**2 )/ OriginalX.shape[0]))
# Plot the cross-validation RMSE collected for every candidate tree depth.
plt.figure()
plt.plot(CandidatesOfDTDepth, RMSECvAll, 'k', linewidth=2)
plt.xlabel("Depth of tree for DT")
plt.ylabel("RMSE in CV for DT")
plt.show()
# Pick the first candidate depth whose CV RMSE equals the minimum.
OptimalMaxDepthDT = CandidatesOfDTDepth[np.where( RMSECvAll == np.min(RMSECvAll) )[0][0] ]
# Refit on the full data with the selected depth; results go to column 7.
DTResult = tree.DecisionTreeRegressor(max_depth=OptimalMaxDepthDT, min_samples_leaf=MinSamplesLeafDT)
DTResult.fit( OriginalX, Originaly )
CalculatedYAll[:,7] = DTResult.predict(OriginalX)
# Fix the global NumPy seed around the CV prediction, then re-seed without
# an argument afterwards.
np.random.seed(10000)
PredictedYcvAll[:,7] = model_selection.cross_val_predict(DTResult, OriginalX, Originaly, cv=FoldNumber)
np.random.seed()
# Check rules of DT
# Column labels are read from data.csv: columns[1:] name the features.
# NOTE(review): class_names receives columns[0] (a single label) although
# DTResult is a regressor - confirm this argument has the intended effect.
datapdDT = pd.read_csv("data.csv", encoding='SHIFT-JIS', index_col=0)
with contextlib.closing(StringIO()) as DTfile:
    tree.export_graphviz(DTResult, out_file=DTfile,
                         feature_names=datapdDT.columns[1:],
                         class_names=datapdDT.columns[0])
    output = DTfile.getvalue().splitlines()
# Insert a node font directive as the second line of the DOT source
# (presumably so Japanese labels render - the CSV is read as Shift-JIS).
output.insert(1, 'node[fontname="meiryo"];')
with open('DTResult.dot', 'w') as f:
    f.write('\n'.join(output))
# Estimate Y for new samples based on DT in 1. and 2.
PredictedY1All[:,7] = DTResult.predict(OriginalX_prediction1)
PredictedY2All[:,7] = DTResult.predict(OriginalX_prediction2)

# 9. Random Forest (RF)
NumberOfTreesRF = 500 # 1. Number of decision trees
CandidatesOfXvariablesRateRF = np.arange( 1, 10, dtype=float)/10 #candidates of the ratio of the number of explanatory variables (X) for decision trees
# Run RFR for every candidate of X-ratio and estimate values of objective variable (Y) for Out Of Bag (OOB) samples
# Report the p values computed earlier for the Gini-based trees, with a blank
# line between entries.
print("")
print("Data 1's Upper p value with Gini -> ", gini_upper_p_data1)
print("")
print("Data 2's Lower p value with Gini -> ", gini_lower_p_data2)
print("")
print("Data 2's Upper p value with Gini -> ", gini_upper_p_data2)
print("")

#Part i

# Feature labels shown in the rendered trees for each data set.
data_1_col_names = [
    "age", "job", "marital", "education", "balance", "housing", "duration",
    "poutcome"
]
data_2_col_names = ["job", "marital", "education", "housing"]
# One in-memory DOT buffer per (data set, criterion) combination; only the
# data-1 entropy buffer is used within this excerpt.
dot_data1_entropy = StringIO()
dot_data2_entropy = StringIO()
dot_data1_gini = StringIO()
dot_data2_gini = StringIO()
# Export the data-1 entropy tree and render it to a PNG file.
export_graphviz(entropy_data_1,
                out_file=dot_data1_entropy,
                filled=True,
                rounded=True,
                special_characters=True,
                feature_names=data_1_col_names,
                class_names=["0", "1"])
data_1_entropy_graph = pydotplus.graph_from_dot_data(
    dot_data1_entropy.getvalue())
data_1_entropy_graph.write_png('data_1_entropy.png')
# Builds an inline image from the PNG bytes (only displayed when this is the
# last expression of a notebook cell).
Image(data_1_entropy_graph.create_png())
Esempio n. 6
0
    def train():
        """Train the questionnaire decision tree and export its artefacts.

        Loads the data via ``DecisionTreeQuestionnaire.get_csv_file_data()``,
        turns blank cells into the string ``'nan'``, one-hot encodes the
        features, drops the ``<name>=nan`` indicator columns, fits an entropy
        based ``DecisionTreeClassifier`` on a 90/10 split, prints the test
        accuracy, writes ``decisiontree.png`` and finally derives the decision
        paths through ``tree_to_code`` / ``tree_to_code2``.

        Side effects: extends ``DecisionTreeQuestionnaire.feature_names``,
        sets ``DecisionTreeQuestionnaire.isModelTrained`` and prints to stdout.
        """
        # NOTE(review): the first 56 encoded columns are used as feature
        # labels, as in the original code - confirm this matches the data.
        max_feature_labels = 56

        balance_data_excel = DecisionTreeQuestionnaire.get_csv_file_data()
        # Clean the data: empty / whitespace-only cells become the string 'nan'.
        balance_data_excel = balance_data_excel.replace(
            r'^\s*$', str(np.nan), regex=True).replace('', str(np.nan))
        balance_data_excel = balance_data_excel.applymap(str)

        print("Dataset Length:: ", len(balance_data_excel))
        print("Dataset Shape:: ", balance_data_excel.shape)

        X = balance_data_excel.iloc[:, :-1]
        # NOTE(review): the target is read from fixed position 22 while X takes
        # "all but the last column"; both agree only for a 23-column dataset.
        y = balance_data_excel.iloc[:, 22]

        # Index.get_values() was removed in pandas 1.0; tolist() gives the
        # same list of column labels on every pandas version.
        X = DecisionTreeQuestionnaire.encode_onehot(X, X.columns.tolist())

        le_y = LabelEncoder()
        y = le_y.fit_transform(y)

        # Drop the indicator columns that encode a missing answer. Columns
        # without '=' are kept instead of raising IndexError.
        nan_columns = []
        for column in X.columns:
            parts = column.split('=')
            if len(parts) > 1 and parts[1] == 'nan':
                nan_columns.append(column)
        X.drop(nan_columns, axis=1, inplace=True)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.1,
                                                            random_state=100)

        clf_entropy = DecisionTreeClassifier(criterion="entropy",
                                             random_state=100,
                                             max_depth=100,
                                             min_samples_leaf=5,
                                             min_samples_split=8)
        abc = clf_entropy.fit(X_train, y_train)

        y_pred = clf_entropy.predict(X_test)

        print("Accuracy is ", accuracy_score(y_test, y_pred) * 100)

        # Convert the target classes to {encoded index: original label}.
        print("le2")
        class_names = dict(enumerate(le_y.classes_))
        print(class_names)

        feature_labels = list(X.columns[0:max_feature_labels])
        DecisionTreeQuestionnaire.feature_names.extend(feature_labels)

        # export_graphviz expects sequences for the label arguments (newer
        # scikit-learn rejects dicts), so pass lists ordered by index.
        dot_data = StringIO()
        export_graphviz(clf_entropy,
                        out_file=dot_data,
                        filled=True,
                        rounded=True,
                        special_characters=True,
                        feature_names=feature_labels,
                        class_names=list(le_y.classes_))

        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # Recent pydot returns a list of graphs, older releases a single one.
        if isinstance(graph, (list, tuple)):
            graph = graph[0]
        graph.write_png('decisiontree.png')

        DecisionTreeQuestionnaire.tree_to_code(abc)

        DecisionTreeQuestionnaire.tree_to_code2(abc, class_names)

        DecisionTreeQuestionnaire.isModelTrained = True

        for path in DecisionTreeQuestionnaire.paths:
            for k, v in path.items():
                print(k + ' : ' + v)
            print("-----------------------------------")

            # NOTE(review): printed once per path, as in the original; this
            # looks like leftover output - confirm whether it belongs here.
            print('Your inputs are not defined')
Esempio n. 7
0
    def decision_tree_training(self):
        """Train, visualise, evaluate and persist the posture decision tree.

        Splits ``self.feature`` / ``self.label`` 75/25, dumps every partition
        to text files, fits a Gini ``DecisionTreeClassifier``, renders it to
        ``decision_tree.pdf``, prints a classification report for the held-out
        part (also dumping true and predicted labels) and saves the model with
        ``joblib``. Sets ``self.target_names`` and ``self.feature_names``.
        """
        self.target_names = ['lying', 'lie on the side', 'sitting', 'standing']
        # The 33 feature columns are simply labelled '1' .. '33'.
        self.feature_names = [str(number) for number in range(1, 34)]
        print('---start training decision tree---')

        # Hold out a quarter of the samples for evaluation.
        partitions = train_test_split(self.feature,
                                      self.label,
                                      test_size=0.25,
                                      random_state=0)
        X_train, X_test, Y_train, Y_test = partitions

        # Keep a copy of each partition on disk: features as floats, labels
        # as integers.
        partition_dumps = (
            ('/home/hts/posture_classification_based_pose/decision_tree/X_train.txt',
             X_train, '%f'),
            ('/home/hts/posture_classification_based_pose/decision_tree/X_test.txt',
             X_test, '%f'),
            ('/home/hts/posture_classification_based_pose/decision_tree/Y_train.txt',
             Y_train, '%d'),
            ('/home/hts/posture_classification_based_pose/decision_tree/Y_test.txt',
             Y_test, '%d'),
        )
        for dump_path, values, number_format in partition_dumps:
            np.savetxt(dump_path, values, fmt=number_format)
        print('---split data done!---')
        print()

        # CART is the algorithm used by default.
        clf = DecisionTreeClassifier(criterion='gini', random_state=0)
        train_labels = Y_train.ravel()
        print(np.shape(X_train), np.shape(train_labels))
        clf.fit(X_train, train_labels)

        # Visualisation: DOT text in memory, rendered to PDF.
        dot_data = StringIO()
        tree.export_graphviz(clf,
                             out_file=dot_data,
                             feature_names=self.feature_names,
                             class_names=self.target_names,
                             filled=True,
                             rounded=True,
                             impurity=False)
        pydotplus.graph_from_dot_data(
            dot_data.getvalue()).write_pdf("decision_tree.pdf")

        # Validate on the held-out set.
        for line in ("Detailed classification report:", None,
                     "The model is trained on the full development set.",
                     "The scores are computed on the full evaluation set.",
                     None):
            if line is None:
                print()
            else:
                print(line)

        Y_true = Y_test.ravel()
        Y_pred = clf.predict(X_test)
        np.savetxt(
            '/home/hts/posture_classification_based_pose/decision_tree/Y_true.txt',
            Y_true,
            fmt='%d')
        np.savetxt(
            '/home/hts/posture_classification_based_pose/decision_tree/Y_pred.txt',
            Y_pred,
            fmt='%d')
        report = classification_report(Y_true,
                                       Y_pred,
                                       target_names=self.target_names)
        print(report)
        print()

        print('Decision Tree model saving ......')
        model_save_path = '/home/hts/posture_classification_based_pose/decision_tree/train_decision_tree_model.m'
        joblib.dump(clf, model_save_path)
Esempio n. 8
0
# Split the rows into train / test using the `is_train` column as a mask.
kyphosis_train = kyphosis[kyphosis.is_train]
kyphosis_test = kyphosis[kyphosis.is_train == False]

# Train model
# Every column after the first is used as a feature.
# NOTE(review): this slice also contains `is_train` unless it is column 0 -
# confirm the column order of `kyphosis`.
kyphosis_features = kyphosis.columns[1:]
kyphosis_dt_clf = DecisionTreeClassifier(criterion='entropy',
                                         max_depth=None,
                                         min_samples_split=2,
                                         min_samples_leaf=1)
kyphosis_dt_clf = kyphosis_dt_clf.fit(kyphosis_train[kyphosis_features],
                                      kyphosis_train['Kyphosis'])

# Print a string representation of the tree.
# If you have graphviz (www.graphviz.org) installed, you can write a pdf
# visualization using graph.write_pdf(filename)
kyphosis_dt_data = StringIO()
tree.export_graphviz(kyphosis_dt_clf, out_file=kyphosis_dt_data)
kyphosis_dt_graph = pydotplus.parser.parse_dot_data(
    kyphosis_dt_data.getvalue())
print(kyphosis_dt_graph.to_string())

# Predict classes of test set and evaluate
kyphosis_dt_pred = kyphosis_dt_clf.predict(kyphosis_test[kyphosis_features])

# Confusion matrix with the label order fixed to 'absent', 'present'.
kyphosis_dt_cm = metrics.confusion_matrix(kyphosis_test['Kyphosis'],
                                          kyphosis_dt_pred,
                                          labels=['absent', 'present'])
print(kyphosis_dt_cm)
kyphosis_dt_acc = metrics.accuracy_score(kyphosis_test['Kyphosis'],
                                         kyphosis_dt_pred)
kyphosis_dt_prec = metrics.precision_score(kyphosis_test['Kyphosis'],
Esempio n. 9
0
from sklearn import tree

# Hold out one sample per iris class (the first row of each block of 50).
iris = load_iris()
test_idx = [0, 50, 100]

train_target = np.delete(iris.target, test_idx)
train_data = np.delete(iris.data, test_idx, axis=0)

test_target = iris.target[test_idx]
test_data = iris.data[test_idx]

clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)

# Expected labels followed by the classifier's predictions.
print(test_target)
print(clf.predict(test_data))

# StringIO comes from the standard library: the `sklearn.externals.six` shim
# used previously is not shipped by current scikit-learn releases.
from io import StringIO
import pydotplus

# Export the tree as DOT text in memory and render it to a PDF file.
dotdata = StringIO()
tree.export_graphviz(
    clf,
    out_file=dotdata,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True, rounded=True,
    impurity=False)
graph = pydotplus.graph_from_dot_data(dotdata.getvalue())
graph.write_pdf("target.pdf")
    def customer_segment(customerslist, productname):
        """Fit and render a decision tree on who bought ``productname``.

        For every customer id in ``customerslist`` the graph database is
        queried (through ``session``) for: whether the product was bought
        (the target), the country, the spend per category, the most frequent
        price inside the top-spend category and the set of categories bought.
        These are combined into a feature table, a
        ``DecisionTreeClassifier`` is fitted and the tree is written to
        ``search/static/stuff/segment.png`` below ``base_dir``.

        Args:
            customerslist: sequence of customer id strings.
            productname: product description string used for the target.

        Side effects: appends to the shared lists ``actualcategoryspresent``,
        ``starts``, ``ends``, ``bucketsizes``, ``cat``, ``target``,
        ``countryofcustomer``, ``totalspent``, ``category``,
        ``most_buyed_item_cost`` and ``pricesensi``; stores three
        ``'<id> ...vector'`` lists per customer in ``globals()``; the queries
        contain ``set`` clauses that write converted values back to the
        relationships; prints diagnostics.

        NOTE(review): the shared lists are indexed by the customer's position
        in ``customerslist``, so they must be empty when this function is
        called and every customer must have at least one purchase - confirm
        that the caller guarantees this.
        """
        g = globals()

        # All values are passed as query parameters instead of being spliced
        # into the Cypher text, so ids or descriptions that contain quotes no
        # longer break (or alter) the statements.
        price_query = (
            'match(c:customerid)-[r:Bought_this]->(s:stockcode) '
            'set r.Actual_Price=toFloat(r.Actual_Price) '
            'return s.Category,collect(DISTINCT r.Actual_Price) as pricelist '
            'order by s.Category, pricelist'
        )
        target_query = (
            'optional MATCH(c:customerid{CustomerID: $customer_id})'
            '-[r:Bought_this]->(s:stockcode{Description: $description}) '
            'return distinct '
            'case '
            'when r.Quantity IS NULL THEN 0 '
            'when r.Quantity IS NOT NULL THEN 1 '
            'else r.Quantity END AS Quantity '
        )
        country_query = (
            'match(c1:customerid{CustomerID: $customer_id})'
            '-[r1:Bought_this]->(s1:stockcode) return c1.Country limit 1'
        )
        spend_query = (
            'match(c1:customerid{CustomerID: $customer_id})'
            '-[r1:Bought_this]->(s1:stockcode) '
            'set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price) '
            'return c1.CustomerID as CustomerID, s1.Category as Category,'
            'reduce(sum=0, i in collect(r1.Quantity * r1.Price) | sum + i) as totalspent , '
            'reduce(sum=0, i in collect(r1.Quantity) | sum + i) as Quantity '
            'order by totalspent desc '
        )
        category_price_query = (
            'match(c1:customerid{CustomerID: $customer_id})'
            '-[r1:Bought_this]->(s1:stockcode{Category: $category}) '
            'set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price) '
            'return c1.CustomerID as CustomerID,'
            'collect(r1.Actual_Price) as prices_in_category'
        )
        categories_query = (
            'match(c1:customerid{CustomerID: $customer_id})'
            '-[r1:Bought_this]->(s1:stockcode) '
            'set r1.Quantity=toInteger(r1.Quantity),r1.Price=toFloat(r1.Price) '
            'return c1.CustomerID as CustomerID, '
            'collect( distinct s1.Category) as Categoryss'
        )

        # Price range of every category, split into ten equally sized buckets.
        for record in session.run(price_query):
            prices = record['pricelist']
            if len(prices) == 1:
                # A single price: open an artificial +/-1 window around it.
                start = prices[0] - 1
                end = prices[0] + 1
            else:
                start = min(prices)
                end = max(prices)

            actualcategoryspresent.append(record['s.Category'])
            starts.append(start)
            ends.append(end)
            bucketsizes.append((end - start) / 10)
            cat.append(record['s.Category'])

        # Target: 1 when the customer bought the product, otherwise 0.
        for customer in customerslist:
            params = {'customer_id': customer, 'description': productname}
            for record in session.run(target_query, params):
                target.append(record[0])

        for customer in customerslist:
            for record in session.run(country_query,
                                      {'customer_id': customer}):
                countryofcustomer.append(record[0])

        # Per-customer vectors plus the category with the highest spend.
        for customer in customerslist:
            g[customer + ' vector'] = [0] * len(cat)
            g[customer + ' category-wise purchase vector'] = [0] * len(cat)
            g[customer + ' bucket_vector'] = [0] * 4

            is_top_row = True
            for record in session.run(spend_query, {'customer_id': customer}):
                if is_top_row:
                    # Rows are ordered by spend, so the first one is the
                    # customer's top category.
                    totalspent.append(record['totalspent'])
                    category.append(record['Category'])
                    is_top_row = False
                if record['Category'] in cat:
                    vv = cat.index(record['Category'])
                    g[record['CustomerID'] +
                      ' category-wise purchase vector'][vv] = record['Quantity']

        # Most frequent price the customer paid inside the top category.
        for position, customer in enumerate(customerslist):
            params = {'customer_id': customer, 'category': category[position]}
            for record in session.run(category_price_query, params):
                most_buyed_item_cost.append(float(Most_Common(record[1])))

        # Binary vector of the categories the customer bought from.
        for customer in customerslist:
            for record in session.run(categories_query,
                                      {'customer_id': customer}):
                for bought_category in record['Categoryss']:
                    if bought_category in cat:
                        g[customer + ' vector'][cat.index(bought_category)] = 1

        # Price sensitivity: bucket of the most frequently paid price.
        for t in range(len(customerslist)):
            # NOTE(review): when the category is unknown the previous index is
            # reused, exactly as in the original - confirm this is intended.
            if category[t] in cat:
                vv = cat.index(category[t])
            sensitivity = drop_in_bucket(starts[vv], ends[vv], bucketsizes[vv],
                                         most_buyed_item_cost[t])
            pricesensi.append(sensitivity)
        ageassign(pricesensi)

        # One feature row per customer. The rows are collected first and the
        # frame is built once, because DataFrame.append no longer exists in
        # pandas 2.x.
        bucket_slot = {'High': 0, 'Medium High': 1, 'Medium Low': 2, 'Low': 3}
        rows = []
        for t, customer in enumerate(customerslist):
            slot = bucket_slot.get(pricesensi[t])
            if slot is not None:
                g[customer + ' bucket_vector'][slot] = 1
            rows.append({
                'age': ages[t],
                'p_s': pricesensi[t],
                'category': category[t],
                'totalspent': totalspent[t],
                'total_cat_bought': int(sum(g[customer + ' vector'])),
                'country': countryofcustomer[t],
            })
        df = pd.DataFrame(rows, columns=[
            'age', 'p_s', 'category', 'totalspent', 'total_cat_bought',
            'country'
        ])

        # The country is not used as a feature; price sensitivity and
        # category are one-hot encoded.
        df = df.drop('country', axis=1)

        df = df.join(pd.get_dummies(df.p_s))
        df = df.drop('p_s', axis=1)

        df = df.join(pd.get_dummies(df.category))
        df = df.drop('category', axis=1)

        train_data = df.values
        train_target = target
        classifier = tree.DecisionTreeClassifier()
        print("##################____________________productname", productname)
        print(train_data)
        print(train_target)
        classifier.fit(train_data, train_target)

        # The original iterated over the buffer right after writing it, which
        # yields nothing; that no-op loop has been dropped.
        dot_data = StringIO()
        tree.export_graphviz(classifier,
                             out_file=dot_data,
                             feature_names=df.columns.tolist(),
                             class_names=['No', 'Yes'],
                             filled=True,
                             rounded=True,
                             impurity=False)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph.write_png(
            os.path.join(base_dir, 'search/static/stuff/segment.png'))
        print(productname, "productname727")
Esempio n. 11
0
from IPython.display import Image

import os
import sys


def conda_fix(graph):
    """Point ``graph`` at the Graphviz executables of the base environment.

    Builds ``<sys.base_exec_prefix>/Library/bin/graphviz/<tool>.exe`` for the
    dot, twopi, neato, circo and fdp tools and hands the resulting mapping to
    ``graph.set_graphviz_executables``.
    """
    graphviz_dir = os.path.join(sys.base_exec_prefix, "Library", "bin",
                                "graphviz")
    executables = {}
    for tool in ("dot", "twopi", "neato", "circo", "fdp"):
        executables[tool] = os.path.join(graphviz_dir, tool + ".exe")
    graph.set_graphviz_executables(executables)


from sklearn import tree

# Export the fitted tree `dt` as DOT text into an in-memory buffer.
# NOTE(review): class_names receives the feature columns (X.columns), the same
# value as feature_names - presumably the target's class labels were intended;
# confirm.
buffer = StringIO()
tree.export_graphviz(dt,
                     out_file=buffer,
                     feature_names=X.columns,
                     class_names=X.columns,
                     filled=True,
                     rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(buffer.getvalue())
# Register the Graphviz executable paths before rendering.
conda_fix(graph)
graph.write_pdf("loan_tree.pdf")
# Builds an inline image from the PNG bytes (only displayed when this is the
# last expression of a notebook cell).
Image(graph.create_png())
#ada-booster #boosting
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(base_estimator=dt,
    def product_segment(productslist, customerid):
        """Fit and render a decision tree of which products a customer bought.

        For every stock code in ``productslist`` the product's description,
        price and category are read from the graph database, the price is
        mapped to a bucket with ``price_bucket_of_product`` and category and
        bucket are one-hot encoded. A ``DecisionTreeClassifier`` is fitted on
        those rows against a 0/1 "bought by ``customerid``" target and the
        tree is written to ``segment.png``.

        Args:
            productslist: Stock codes to describe (presumably strings, as
                stored in the ``StockCode`` property -- confirm with caller).
            customerid: Customer id matched against the ``CustomerID``
                property.

        Side effects:
            Appends to ``actualcategoryspresent``, ``starts``, ``ends`` and
            ``bucketsizes`` (all defined outside this function), runs a Cypher
            ``SET`` that rewrites ``Actual_Price`` as a float, and overwrites
            the PNG file.
        """
        descriptions = []
        mrp = []
        categoryofproduct = []
        yes_no = []
        cat = []
        xx='match(c:customerid)-[r:Bought_this]->(s:stockcode) ' \
           'set r.Actual_Price=toFloat(r.Actual_Price)' \
           'return s.Category,collect(DISTINCT r.Actual_Price) as pricelist ' \
           'order by s.Category, pricelist'
        pricebucket = session.run(xx)

        # One price range (split into 10 equal buckets) per category.
        # NOTE(review): starts/ends/bucketsizes are not reset here, while `cat`
        # is local; indices only line up if those lists are empty on entry --
        # confirm with the code that owns them.
        for record in pricebucket:
            prices = record['pricelist']
            actualcategoryspresent.append(record['s.Category'])
            if len(prices) == 1:
                # A single price would give a zero-width range; widen it.
                start = prices[0] - 1
                end = prices[0] + 1
            else:
                start = min(prices)
                end = max(prices)
            starts.append(start)
            ends.append(end)
            bucketsizes.append((end - start) / 10)
            cat.append(record['s.Category'])

        # Values are passed as query parameters instead of being concatenated
        # into the Cypher text, so ids containing quotes cannot alter the query.
        product_query = (
            'match() - [r:Bought_this]->(s:stockcode{StockCode:$stock_code}) '
            'return distinct s.Description as description, s.Category as category, r.Actual_Price as MRP '
            'limit 1')
        for stock_code in productslist:
            for record in session.run(product_query,
                                      {'stock_code': stock_code}):
                descriptions.append(record['description'])
                mrp.append(record['MRP'])
                categoryofproduct.append(record['category'])

        # 1 when the customer has a Bought_this relationship to the product,
        # 0 otherwise (OPTIONAL MATCH yields a null Quantity in that case).
        bought_query = (
            'optional MATCH(c:customerid{CustomerID: $customer_id})-[r:Bought_this]->(s:stockcode{StockCode: $stock_code}) '
            'return distinct '
            'case '
            'when r.Quantity IS NULL THEN 0 '
            'when r.Quantity IS NOT NULL THEN 1 '
            'else r.Quantity END AS Quantity ')
        for stock_code in productslist:
            bought_params = {'customer_id': customerid,
                             'stock_code': stock_code}
            for record in session.run(bought_query, bought_params):
                yes_no.append(record[0])

        # Collect plain dicts and build the frame once: DataFrame.append copied
        # the whole frame on every call and no longer exists in pandas 2.x.
        rows = []
        for description, price, category in zip(descriptions, mrp,
                                                categoryofproduct):
            bucket_idx = cat.index(category)
            rows.append({
                'Description': description,
                'MRP': price,
                'category': category,
                'PriceBucket': price_bucket_of_product(starts[bucket_idx],
                                                       ends[bucket_idx],
                                                       bucketsizes[bucket_idx],
                                                       price),
            })
        df1 = pd.DataFrame(
            rows, columns=['Description', 'MRP', 'category', 'PriceBucket'])

        # The free-text description is not used as a feature.
        df1 = df1.drop('Description', axis=1)
        # One-hot encode the categorical columns, category first.
        for column in ('category', 'PriceBucket'):
            df1 = df1.join(pd.get_dummies(df1[column])).drop(column, axis=1)

        train_data = df1.values
        train_target = yes_no

        x = tree.DecisionTreeClassifier()
        x.fit(train_data, train_target)

        dot_data = StringIO()
        tree.export_graphviz(x,
                             out_file=dot_data,
                             feature_names=df1.columns.tolist(),
                             class_names=['No', 'Yes'],
                             filled=True,
                             rounded=True,
                             impurity=False)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

        # Raw string: same path as before, without invalid "\p" / "\s" escapes.
        graph.write_png(r'D:\project\search\static\stuff/segment.png')
            clf_tree = DecisionTreeClassifier(max_depth=depth,
                                              min_samples_split=split,
                                              min_samples_leaf=leaf)
            # Accuracy of every cross-validation fold for this parameter set.
            temp = []
            for tr, te in sk_fold.split(X, y):
                clf_tree.fit(X[tr], y[tr])
                # Append to the list created above (`tmp` was never defined).
                temp.append(clf_tree.score(X[te], y[te]))
            # Mean over all folds rather than over the last fold's score alone.
            avg_acc = np.mean(temp)

# DT modeling (max_depth=10, min_samples_split=50, min_samples_leaf=50)
clf_tree = DecisionTreeClassifier(max_depth=10,
                                  min_samples_split=50,
                                  min_samples_leaf=50)
clf_tree.fit(X_tr, Y_tr)
DT_pred = clf_tree.predict(X_te)
# Print the metrics: as bare expressions their values were silently dropped.
print(confusion_matrix(Y_te, DT_pred))
print(clf_tree.score(X_te, Y_te))
print(classification_report(Y_te, DT_pred))

# DT plot
# Export exactly once. Writing two graphs into the same buffer only worked
# because the DOT parser then returned a list that could be indexed.
dot_file = StringIO()
tree.export_graphviz(clf_tree, out_file=dot_file, filled=True)

graph = pydotplus.graph_from_dot_data(dot_file.getvalue())
# The parser returns either a single graph or a list of graphs; accept both.
dt_graph = graph[-1] if isinstance(graph, (list, tuple)) else graph
dt_graph.write_pdf(r'C:\Users\User\Desktop\moviedata\tree.pdf')
# Bare expression: shows the PNG inline when this runs as a notebook cell.
Image(dt_graph.create_png())
Esempio n. 14
0
def _decision_tree_regression_train(
        table,
        feature_cols,
        label_col,
        criterion='mse',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Fit a ``DecisionTreeRegressor`` and build its model dict and report.

    Args:
        table: Data indexed as ``table[feature_cols]`` / ``table[label_col]``
            (presumably a pandas DataFrame -- confirm with caller).
        feature_cols: Names of the feature columns.
        label_col: Name of the target column.
        criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
        min_weight_fraction_leaf, max_features, random_state, max_leaf_nodes,
        min_impurity_decrease, min_impurity_split, presort: Forwarded
            unchanged to ``DecisionTreeRegressor``.
        sample_weight, check_input, X_idx_sorted: Forwarded unchanged to
            ``DecisionTreeRegressor.fit``.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted regressor, its
        parameters, feature importances and the rendered report under
        ``'_repr_brtc_'``.
    """
    # Keyword arguments keep every value bound to the intended parameter even
    # if the estimator's positional order differs between library versions.
    regressor = DecisionTreeRegressor(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
    regressor.fit(table[feature_cols],
                  table[label_col],
                  sample_weight=sample_weight,
                  check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Drawing the tree is best effort: any failure falls back to a text hint.
    try:
        # Standard-library buffer, so a missing `six` shim in scikit-learn is
        # not misreported as a missing Graphviz installation.
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate instead of being swallowed here.
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report

    # Ascending importance, so the most important feature is the top bar.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    # Write each bar's value next to its end.
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    # Clear the figure so later plots do not draw on top of this one.
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    # Add tree plot

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Esempio n. 15
0
def draw_tree(model, name):
    """Export ``model`` as Graphviz DOT and save the rendering as ``<name>.pdf``."""
    dot_buffer = StringIO()
    _tree.export_graphviz(model, out_file=dot_buffer)
    dot_source = dot_buffer.getvalue()
    rendered = pydotplus.graph_from_dot_data(dot_source)
    rendered.write_pdf(name + ".pdf")
def train_holdout(base_dir, classifier_name, classifier):
    """Train and evaluate ``classifier`` on a stratified hold-out split.

    Reads ``features.csv`` (first column: image directory, remaining columns:
    features) and ``Y.csv`` (class labels) from ``base_dir``, keeps 30% of the
    rows for testing and trains on the rest. It then prints every misclassified
    image and the test accuracy, shows a confusion-matrix heatmap and, for a
    ``DecisionTreeClassifier``, writes the tree to ``<classifier_name>.pdf``.

    Classifiers exposing ``partial_fit`` are trained chunk by chunk with early
    stopping on a validation split; all others are trained with ``fit``.

    Args:
        base_dir: Directory holding ``features.csv`` and ``Y.csv`` (both
            ``;``-separated, read without a header row).
        classifier_name: Label used in printed messages, plot titles and the
            PDF file name.
        classifier: Estimator providing ``fit`` or ``partial_fit`` plus
            ``predict``, ``predict_proba`` and ``score``.

    Returns:
        The tuple ``(classifier, X_test, y_test)``.
    """
    base = pd.read_csv(f'{base_dir}/features.csv', sep=';', header=None)
    # The first column holds the image directory of each row.
    images_dirs = base.iloc[:, 0]
    # Every remaining column is a feature.
    X = base.iloc[:, 1:]

    # Load class names
    y = pd.read_csv(f'{base_dir}/Y.csv', sep=';', header=None)
    # HOLDOUT: 70% of the rows to train, 30% to test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                        random_state=42, stratify=y)

    all_classes = np.unique(y.to_numpy())

    def overfitting_prevent_train(X_train, y_train):
        """Fit ``classifier`` chunk by chunk, stopping once validation error stalls."""
        # Keep 20% of the training rows aside for validation.
        X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=.2,
                                                                        random_state=42, stratify=y_train)

        chunk_size = 10

        graphic_data = []
        current_epoch = 0

        # Weight of the previous smoothed value in the exponential smoothing.
        alpha = 0.8
        smooth_error_training = []
        smooth_error_validation = []
        # Consecutive epochs without a meaningful validation improvement.
        overfitting_count = 0

        # NOTE(review): chunk[0] / chunk[1] are presumably the feature and
        # label parts of each chunk -- confirm against chunk_generator.
        for chunk in chunk_generator(pd.concat([X_train, y_train], axis=1), chunk_size=chunk_size):
            classifier.partial_fit(chunk[0], np.ravel(chunk[1]), classes=all_classes)

            current_error_on_training = 1 - classifier.score(X_train, np.ravel(y_train))
            current_error_on_validation = 1 - classifier.score(X_validation, np.ravel(y_validation))

            graphic_data.append(
                GraphicData(x=current_epoch,
                            error_on_val=current_error_on_validation,
                            error_on_train=current_error_on_training)
            )

            if current_epoch == 0:
                smooth_error_training.append(current_error_on_training)
                smooth_error_validation.append(current_error_on_validation)
            else:
                # Blend the previous smoothed value with the previous epoch's
                # raw error.
                smooth_error_training.append(
                    alpha * smooth_error_training[current_epoch - 1] + (1 - alpha) * graphic_data[
                        current_epoch - 1].error_on_train)
                smooth_error_validation.append(
                    alpha * smooth_error_validation[current_epoch - 1] + (1 - alpha) * graphic_data[
                        current_epoch - 1].error_on_val)

            if current_epoch >= 1:
                if smooth_error_validation[-2] - smooth_error_validation[-1] < 1e-03:
                    # Smoothed validation error improved by less than 1e-3.
                    overfitting_count += 1
                    if overfitting_count >= 5:
                        print(f'Overfitting Detected on {classifier_name}, epoch: {current_epoch}')
                        break
                else:
                    # A real improvement restarts the patience counter. The
                    # reset belongs to the improvement check: paired with the
                    # `>= 5` test it fired after every increment, so the
                    # counter never got past 1.
                    overfitting_count = 0

            current_epoch += 1

        print(f'Acurácia {classifier_name} Validação: {classifier.score(X_validation, np.ravel(y_validation))}')

        plt.title(f'Error Compare --> {classifier_name} --> {base_dir}')

        epochs = [data.x for data in graphic_data]
        plt.plot(epochs, smooth_error_training, label='Error on Train Smooth')
        plt.plot(epochs, smooth_error_validation, label='Error on Validation Smooth')

        plt.ylabel('Error')
        plt.xlabel('Epoch')
        plt.legend()
        plt.show()

    def normal_fit():
        """Fit ``classifier`` on the whole training split in one call."""
        classifier.fit(X_train, y_train)

    def visualizate_data():
        """Show a 2-D t-SNE scatter plot of all samples coloured by class."""
        x_tsne = TSNE(n_components=2).fit_transform(X)

        y_aux = pd.DataFrame()
        y_aux['classes'] = y[0]

        tsne_df = pd.DataFrame()
        tsne_df['tsne-x'] = x_tsne[:, 0]
        tsne_df['tsne-y'] = x_tsne[:, 1]
        tsne_df = pd.concat([tsne_df, y_aux], axis=1)

        plt.figure(figsize=(16, 10))
        # x / y by keyword: newer seaborn releases reject positional data
        # columns, older ones accept both forms.
        sns.scatterplot(x='tsne-x', y='tsne-y',
                        hue="classes",
                        legend='full',
                        palette=sns.color_palette("hls", 10),
                        alpha=0.3,
                        data=tsne_df)
        plt.show()

    if hasattr(classifier, 'partial_fit'):
        overfitting_prevent_train(X_train, y_train)
    else:
        normal_fit()

    predicated_rows = classifier.predict(X_test)
    predicated_proba = classifier.predict_proba(X_test)

    # Report each misclassified test row with the probability the classifier
    # gave to the (wrong) predicted class.
    for row_pos, (predicated, expected) in enumerate(zip(predicated_rows, y_test.iterrows())):
        # `expected` is an (index label, row) pair from iterrows().
        if expected[1][0] != predicated:
            img_dir = images_dirs[expected[0]]
            percent = predicated_proba[row_pos][np.where(all_classes == predicated)]
            print(f'Confundiu {img_dir} com {predicated}, proba: {percent}')

    print(f'Acurácia {classifier_name} Teste : {classifier.score(X_test, y_test)}')

    cm = confusion_matrix(y_test, predicated_rows)
    df_cm = pd.DataFrame(cm, index=all_classes, columns=all_classes)
    sns.heatmap(df_cm, annot=True)
    plt.title(f'Confusion Matrix --> {classifier_name} --> {base_dir}')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    # visualizate_data()

    if isinstance(classifier, DecisionTreeClassifier):
        # Standard-library buffer: the `six` shim bundled with scikit-learn is
        # gone from recent releases.
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydot

        dot_data = StringIO()
        export_graphviz(classifier, out_file=dot_data, rounded=True,
                        filled=True)
        # pydot returns a list of graphs; the exported tree is the first one.
        graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
        graph.write_pdf(f'{classifier_name}.pdf')

    return classifier, X_test, y_test
Esempio n. 17
0
def wlasne_drzewo():
    """Render the module-level estimator ``clf`` to ``my-tree.pdf``."""
    buffer = StringIO()
    tree.export_graphviz(clf, out_file=buffer)
    dot_source = buffer.getvalue()
    pydotplus.graph_from_dot_data(dot_source).write_pdf("my-tree.pdf")
Esempio n. 18
0
# NOTE: `tree` here is the fitted estimator, not the sklearn.tree module.
tree.fit(x_train, y_train)
print(tree.score(x_train, y_train))
print(tree.score(x_test, y_test))


from sklearn.tree import export_graphviz

# Keep a DOT file on disk as well.
export_graphviz(tree, out_file='cancertree.dot', class_names=['m','b'], feature_names=cancer.feature_names, filled=True)


import pydot
import graphviz
# Standard-library buffer: the `six` shim bundled with scikit-learn is gone
# from recent releases.
from io import StringIO

dotfile = StringIO()
export_graphviz(tree, out_file=dotfile, class_names=['m','b'], feature_names=cancer.feature_names, filled=True)
# pydot returns a list of graphs in current releases; accept a bare graph too.
parsed = pydot.graph_from_dot_data(dotfile.getvalue())
tree_graph = parsed[0] if isinstance(parsed, (list, tuple)) else parsed
tree_graph.write_png("dtree2.png")

import matplotlib.image as mpimg

# Read back the file written just above (not "dtree2.png.png").
img = mpimg.imread('dtree2.png')
plt.imshow(img)
plt.show()

# Standard-library buffer: the `six` shim bundled with scikit-learn is gone
# from recent releases.
from io import StringIO
from IPython.display import Image
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import pydotplus

col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# NOTE(review): header=None treats the first line of the file as data; if this
# CSV ships with a header row, that row ends up in the frame -- confirm.
pima = pd.read_csv("C:/Users/Chaitali/Desktop/SIMLP 2019/diabetes.csv", header=None, names=col_names)
print(pima.head())

# Every column except 'skin' is used as a feature; 'label' is the target.
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols]
y = pima.label
# 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train the decision tree classifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Render the fitted tree: DOT text in memory -> PNG on disk.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('diabetes.png')
# Bare expression: shows the PNG inline when this runs as a notebook cell.
Image(graph.create_png())
# Target vector: the 'severity' column of `df` as a NumPy array.
array_severity = df['severity'].values

# Standardise the feature matrix `array_asmd` (defined earlier in the script).
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows contribute to the scaling statistics.
scaler = preprocessing.StandardScaler().fit(array_asmd)
rescaled_asmd = scaler.transform(array_asmd)
#print(rescaled_asmd)

# Hold out 25% of the rows for testing; random_state fixes the shuffle.
rescaled_asmd_train, rescaled_asmd_test, severity_train, severity_test = train_test_split(
    rescaled_asmd, array_severity, test_size=0.25, random_state=1)
#print(severity_train)

# Single decision tree with default hyper-parameters.
clf_gini = DecisionTreeClassifier()
clf_gini.fit(rescaled_asmd_train, severity_train)

#print(clf_gini.score(rescaled_asmd_test,severity_test))

# Export the fitted tree as DOT text and render it as a PNG.
out = StringIO()
tree.export_graphviz(clf_gini,
                     out_file=out,
                     feature_names=['age', 'shape', 'margin', 'density'])
graph = graph_from_dot_data(out.getvalue())
# Bare expression: shows the PNG inline when this runs as a notebook cell.
Image(graph.create_png())

# 10-fold cross-validation of the tree over the full (scaled) data set.
kcv_score = cross_val_score(clf_gini, rescaled_asmd, array_severity, cv=10)
#print(kcv_score.mean())  #0.73

# Random forest on the same training split, for comparison.
clf_rf = RandomForestClassifier()
clf_rf.fit(rescaled_asmd_train, severity_train)

# 10-fold cross-validation of the forest over the full (scaled) data set.
rf_score = cross_val_score(clf_rf, rescaled_asmd, array_severity, cv=10)
#print(rf_score.mean()) #0.769