def _build_bias_tree(self, attributes: dict,
                     metric_with_metadata: pd.DataFrame) -> None:
    """Fit a CHAID tree and keep it on ``self.bias_tree``.

    Parameters
    ----------
    attributes: dict
        Forwarded unchanged as the second positional argument of
        ``Tree.from_pandas_df`` (presumably a mapping of column name to
        CHAID variable type -- confirm against callers).
    metric_with_metadata: pd.DataFrame
        Data frame handed to ``Tree.from_pandas_df``; ``self.metric_col``
        is passed as the dependent variable name.
    """
    # Tree hyper-parameters all come from the instance configuration.
    tree_options = {
        'min_child_node_size': self.min_child_node_size,
        'dep_variable_type': self.metric_type,
        'max_depth': self.max_depth,
        'alpha_merge': self.alpha,
        'split_threshold': self.split_threshold,
    }
    self.bias_tree = Tree.from_pandas_df(metric_with_metadata,
                                         attributes,
                                         self.metric_col,
                                         **tree_options)
예제 #2
0
    def train(self, x: np.ndarray, y: np.ndarray) -> None:
        """
        Train CHAID classifier

        The fitted tree is stored on ``self.model``; nothing is returned.
        The dependent variable is always treated as categorical.

        Parameters
        ----------
        x: np.ndarray
            Train data set, passed to ``Tree.from_numpy`` as ``ndarr``

        y: np.ndarray
            Train target data, passed to ``Tree.from_numpy`` as ``arr``
        """
        # Hyper-parameters are read from the instance; only the dependent
        # variable type is fixed here.
        self.model = Tree.from_numpy(ndarr=x,
                                     arr=y,
                                     alpha_merge=self.alpha_merge,
                                     max_depth=self.max_depth,
                                     min_parent_node_size=self.min_parent_node,
                                     min_child_node_size=self.min_child_node,
                                     split_threshold=self.split_threshold,
                                     dep_variable_type='categorical')
예제 #3
0
import numpy as np
from sklearn import datasets
from sklearn import tree
from sklearn.tree import _tree

# NOTE(review): `pd`, `le` and `Tree` are used below but are not defined in
# this snippet; they are presumably provided elsewhere -- confirm.
df = pd.read_csv(
    'C:\\Users\\LAC40641\\Desktop\\ejemplo arbol\\prueba_chaid.csv',
    encoding='latin-1')

# Encode the predictor column through `le` and pull the target out as arrays.
le.fit(df['Sucursal_Oficial'].values)
X = np.array(le.transform(df['Sucursal_Oficial']))
y = np.array(df['Sueldo_Normalizado'])

# Tree.from_numpy is given a 2-D predictor matrix: one column, one row per
# observation.
xx = np.array(X).reshape(len(X), 1)

# NOTE: rebinding `tree` shadows the `sklearn.tree` module imported above.
tree = Tree.from_numpy(xx, y, split_titles=['a'], min_child_node_size=1)

# `y` is already an ndarray, so it has no `.values` attribute; zip it
# directly. The final column names are set in one step.
df_p = pd.DataFrame(list(zip(X, y)),
                    columns=['Sucursal_Oficial', 'Sueldo_Normalizado'])

independent_variable_columns = ['Sucursal_Oficial']
dep_variable = 'Sueldo_Normalizado'

# One 'nominal' entry per predictor column (the count follows the list
# instead of a hard-coded multiplier).
tree = Tree.from_pandas_df(df_p,
                           dict(
                               zip(independent_variable_columns,
                                   ['nominal'] *
                                   len(independent_variable_columns))),
                           dep_variable,
                           dep_variable_type='continuous')
예제 #4
0
def churn_clusters(data, dependent="churn"):
    """Fit a CHAID tree predicting ``dependent`` and print it.

    Every column of ``data`` other than ``dependent`` is used as a nominal
    predictor; the tree is built with ``min_child_node_size=5``.

    Parameters
    ----------
    data:
        Data frame passed to ``Tree.from_pandas_df``; its ``columns`` are
        iterated to find the predictors.
    dependent:
        Name of the dependent-variable column (default ``"churn"``).
    """
    from CHAID import Tree

    # Compare whole column names. The previous `set(dependent)` split the
    # name into single characters, so the dependent column was never
    # excluded. Iterating `data.columns` also keeps a stable column order.
    indep_cols = [col for col in data.columns if col != dependent]
    variable_types = {col: 'nominal' for col in indep_cols}
    tree = Tree.from_pandas_df(data,
                               variable_types,
                               dependent,
                               min_child_node_size=5)
    print(tree.print_tree())
예제 #5
0
# Discretize: bucket the 'edad' column into the intervals listed below.
df['edad'] = pd.cut(
    df['edad'],
    bins=[0, 10, 20, 30, 40, 50, 60, 70, 90],
)

# Save the data set (written before 'OFICINA' is stripped below).
df.to_csv('data.csv')

df['OFICINA'] = df['OFICINA'].str.strip()
independent_variable_columns = ['OFICINA']
dep_variable = 'Ventas_IT'
# Minimum parent node size: 3% of the number of rows.
minsplit = 0.03 * len(df['Ventas_IT'])
prueba = Tree.from_pandas_df(
    df,
    {column: 'nominal' for column in independent_variable_columns},
    dep_variable,
    dep_variable_type='continuous',
    max_depth=5,
    min_parent_node_size=minsplit,
)
# Bare names such as the next line only echo a value in an interactive
# session; they have no effect when run as a script.
prueba
prueba.print_tree()
prueba2 = prueba.tree_store

# NOTE(review): `predict` is not defined in this snippet -- confirm where it
# comes from.
probando = predict(df, prueba)
prueba2
# Print the 'mean' entry of each stored node's members.
for node in prueba2:
    print(node.members['mean'])

rules = prueba.classification_rules()
rules
lenrules = len(rules)
예제 #6
0
# Drop the rows whose index labels were collected in `mask_light`.
mask_light = list(mask_light)
df_CHAID.drop(mask_light, inplace=True)

# Same treatment for rows whose vehicle damage value is in the drop list.
mask_vdamage = list(
    df_CHAID[df_CHAID['Vehicle Damage Extent'].isin(
        vehicle_damage_drops)].index)
df_CHAID.drop(mask_vdamage, inplace=True)

# Variables used by the CHAID calls below
indep_var = ['Light']
dep_var1 = 'Injury Severity'
dep_var2 = 'Vehicle Damage Extent'

# Build one depth-1 tree per dependent variable, each with every predictor
# declared nominal.
tree1 = Tree.from_pandas_df(
    df_CHAID,
    {name: 'nominal' for name in indep_var},
    dep_var1,
    max_depth=1,
)
tree2 = Tree.from_pandas_df(
    df_CHAID,
    {name: 'nominal' for name in indep_var},
    dep_var2,
    max_depth=1,
)

# Convert each tree to a tree object (the return values are discarded here)
tree1.to_tree()
tree2.to_tree()

# Print both trees to inspect the raw results
tree1.print_tree()
tree2.print_tree()

# The ratios are the # of worst-case to # of other cases
예제 #7
0
    'Q2_4', 'Q2_5', 'Q2_6', 'Q2_7', 'Q2_8', 'Q2_9', 'Q2_10', 'Q3_1', 'Q3_2',
    'Q3_3', 'Q3_4', 'Q3_5', 'Q3_6', 'Q3_7', 'Q3_8', 'Q3_9', 'Q3_10'
]

# Bare expressions in this snippet only echo values in an interactive
# session; they have no effect when run as a script.
len(features)

#=====================================
# create tree

features
# Declare every feature ordinal. The count follows `len(features)` rather
# than a hard-coded 34, so no feature is silently dropped by `zip` if the
# list is longer than that.
dict(zip(features, ['ordinal'] * len(features)))

tree = Tree.from_pandas_df(df,
                           dict(zip(features,
                                    ['ordinal'] * len(features))),
                           'clus',
                           dep_variable_type='categorical',
                           alpha_merge=0.05,
                           max_depth=2,
                           min_parent_node_size=1,
                           min_child_node_size=0,
                           split_threshold=0)

tree

tree.print_tree()

dir(tree)

# Inspect individual nodes; `_members` is a private attribute of the node.
dir(tree.get_node(3))
tree.get_node(0)._members
예제 #8
0
파일: Model.py 프로젝트: usmansd/MLStuff
## build the sample data: five rows of [1, 2, 3] followed by five of [2, 2, 3]
ndarr = np.array([[1, 2, 3]] * 5 + [[2, 2, 3]] * 5)
df = pd.DataFrame(ndarr, columns=['a', 'b', 'c'])
## dependent values: 1 for the first five rows, 2 for the last five
arr = np.repeat([1, 2], 5)
df['d'] = arr

print(df)

## CHAID input parameters
independent_variable_columns = ['a', 'b', 'c']
dep_variable = 'd'

## build the Tree through the pandas helper, every predictor nominal
tree = Tree.from_pandas_df(
    df,
    {column: 'nominal' for column in independent_variable_columns},
    dep_variable,
)
## build a tree from the raw arrays instead (this rebinds `tree`)
tree = Tree.from_numpy(
    ndarr,
    arr,
    split_titles=['a', 'b', 'c'],
    min_child_node_size=5,
)
## sketch of the direct-constructor alternative, kept as an inert string
"""
cols = [
  NominalColumn(ndarr[:,0], name='a')
  NominalColumn(ndarr[:,1], name='b')
  NominalColumn(ndarr[:,2], name='c')
]
tree = Tree(cols, NominalColumn(arr, name='d'), {'min_child_node_size': 5})

"""

print(tree.print_tree())
예제 #9
0
def Supervised_Merged(file,
                      df,
                      Predictor_type,
                      dependent_variable_name,
                      indep_column_num,
                      Categorical=True):
    """Merge a predictor's categories with a depth-1 CHAID tree.

    A single-predictor tree of depth 1 is fitted, a report is written to
    ``file``, and the root split's groupings are turned into a mapping from
    original category code to merged-group index.

    Parameters
    ----------
    file:
        Object with a ``write(str)`` method that receives the report.
    df:
        Data frame holding the predictor and the dependent variable.
        When ``Categorical`` is false the dependent column is converted
        IN PLACE with ``pd.to_numeric(..., errors='coerce')``.
    Predictor_type:
        CHAID variable type given to the predictor column.
    dependent_variable_name:
        Column name of the dependent variable.
    indep_column_num:
        Positional index into ``df.columns`` selecting the predictor.
    Categorical:
        Truthy to use CHAID's default dependent-variable type; falsy to
        fit with ``dep_variable_type='continuous'``.

    Returns
    -------
    dict
        ``{category_code: merged_group_index}`` with plain ``int`` keys and
        values, or an empty dict when the root split has fewer than two
        groups.
    """
    predictor_column = df.columns[indep_column_num]

    # Both target types share one tree call; only the continuous case needs
    # the extra conversion and keyword.
    tree_options = {'max_depth': 1}
    if not Categorical:
        # Convert the target variable to numeric (mutates the caller's df).
        df[dependent_variable_name] = pd.to_numeric(
            df[dependent_variable_name], errors='coerce')
        tree_options['dep_variable_type'] = 'continuous'

    # Fit the CHAID tree that merges the categories of the predictor.
    tree = Tree.from_pandas_df(df,
                               {predictor_column: Predictor_type},
                               dependent_variable_name,
                               **tree_options)

    # Print the fitted tree
    file.write('The CHAID TREE is presented below:\n\n')
    file.write(str(tree) + '\n')

    # The groupings come back as one string; splitting on '],' yields one
    # chunk per merged group.
    root_split = tree.tree_store[0].split
    merged_groups = root_split.groupings.split('],')

    file.write('The P-Values of this node is ' + str(root_split.p) + '\n')

    new_dict = {}
    if len(merged_groups) < 2:
        file.write('The P-values is too large.\n')
        file.write(
            'There is no categories can be merged in this variables.\n\n')
        return new_dict

    # Extract the category codes of every group and invert the mapping.
    # For example: groups {0: [1, 2, 3, 4, 5], 1: [6, 7, 8], 2: [0, 9]}
    #              become {0: 2, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, ...}
    file.write('The new categories are:\n')
    for group_index, group_text in enumerate(merged_groups):
        members = [int(token) for token in re.findall(r'\d+', group_text)]
        file.write(str(group_index) + ' >>> ' + str(members) + '\n')
        for member in members:
            new_dict[member] = group_index

    return new_dict
예제 #10
0
# NOTE(review): `pd` and `train_test_split` are not defined in this snippet;
# presumably imported elsewhere -- confirm.
Independent_data = pd.read_csv("Independent_data.csv")
Response = pd.read_csv("Response.csv")
X_train, X_test, y_train, y_test = train_test_split(Independent_data,
                                                    Response,
                                                    test_size=0.25,
                                                    random_state=42)

# Re-attach the response column to each split.
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

from CHAID import Tree
# Fit on the training split. The tree was previously built from
# `test_data`, which left `train_data` unused and the held-out rows no
# longer held out. Every predictor is declared nominal, using plain `str`
# labels.
tree = Tree.from_pandas_df(
    train_data,
    {column: 'nominal' for column in X_train.columns},
    y_train.columns[0],
    max_depth=10,
    min_child_node_size=10)
tree.print_tree()
tree.classification_rules()


def predict(df, tree):
    rules = tree.classification_rules()
    lenrules = len(rules)
    j = 0
    df.index = range(0, df.shape[0])
    Response = np.repeat(0, df.shape[0])
    while (j <= lenrules - 1):
        r1 = rules[j]
예제 #11
0
    index, ztest_list = Z_test(data)
    figdata = get_figdata(index, ztest_list, kms)
    draw_radar(figdata, len(index), 'Z test', [0.5, 1, 1.5, 2, 2.5, 3, 3.5])

    ##### t-test
    index, ttest_list = T_test(data)
    figdata = get_figdata(index, ttest_list, kms)
    draw_radar(figdata, len(index), 'T test', [0.5, 1, 1.5, 2, 2.5, 3, 3.5])

    ##### Chi-square test
    index, chisq_list = Chi_Square_test(data)
    figdata = get_figdata(index, chisq_list, kms)
    draw_radar(figdata, len(index), 'Chi-Square test',
               [0.5, 1, 1.5, 2, 2.5, 3, 3.5])

    ####Chaid tree
    independent_variable_columns = data.columns[1:len(data.columns) - 1]
    d_variable = data.columns[-1]
    dic = {}
    for item in independent_variable_columns[1:len(data.columns) - 1]:
        dic.update({item: 'nominal'})

    tr = Tree.from_pandas_df(data.drop(data.columns[0], axis=1),
                             dic,
                             d_variable,
                             max_depth=8,
                             min_parent_node_size=2,
                             min_child_node_size=2)

    tr.print_tree()