Example 1
def compare_clf():
    c_for_svm = svm_configure()
    k_for_gini, k_for_ig = dt_configure()
    classifiers = [
        SVC(kernel='linear', C=c_for_svm),
        DT(criterion='gini', max_leaf_nodes=k_for_gini),
        DT(criterion='entropy', max_leaf_nodes=k_for_ig),
        LDA()
    ]

    p, r, f = [], [], []
    metrics = ['average_precision_score', 'recall_score', 'f1_score']
    names = ['SVM', 'DT-gini', 'DT-IG', 'LDA']
    for i, clf in enumerate(classifiers):
        y_pred = clf.fit(X_train, Y_train).predict(X_test)
        plot_confusion_matrix(Y_test, y_pred, classes=class_name, idx=i + 1)
        # recall_score/f1_score take `average` as a keyword argument;
        # passing 'weighted' positionally would bind it to `labels`
        precision = average_precision_score(Y_test, y_pred)
        recall = recall_score(Y_test, y_pred, average='weighted')
        f1 = f1_score(Y_test, y_pred, average='weighted')
        p.append(precision)
        r.append(recall)
        f.append(f1)
    plt.show()
    # one bar chart per metric (the original drew `p` in all three panels)
    for j, scores in enumerate([p, r, f]):
        plt.subplot(1, 3, j + 1)
        plt.bar(range(4), scores)
        plt.xticks(range(4), names)
        plt.title(metrics[j])
    plt.show()
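Example 2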
def _find_best_model(x, y, z, params_grid, test_size, log_features=False):
    """
    Performs GridSearch on `params_grid`.
    
    PARAMETERS
    ----------
    - x (numpy array) : the input set of random variables, of shape (N, D1)
    - y (numpy array) : the target set of random variables, of shape (N, D2)
    - z (numpy array) : the conditioning set of random variables, of shape (N, D3)
    - params_grid (dict) : the hyperparameters to try out while performing grid search ; for more details,
                           look up `sklearn.model_selection.GridSearchCV`
    - test_size (float) : the proportion of samples to be used as test data 
    - log_features (bool, default=False) : if True, 'log2' will be used as `max_features` for the Decision Tree
                                           Regressor, provided there are at least 10 features in the input
    
    RETURNS
    -------
    - the Decision Tree Regressor configured with the optimal value for `min_samples_split`.
    """
    model_input = _mix_merge_columns(x, z)
    # `.shape` is a tuple; compare the number of feature columns, per the
    # "at least 10 features" contract in the docstring
    if log_features and model_input.shape[1] >= 10:
        max_features = 'log2'
    else:
        max_features = 'auto'
    cv_splits = ShuffleSplit(n_splits=3, test_size=test_size)
    best_params = GridSearchCV(DT(max_features=max_features),
                               params_grid,
                               cv=cv_splits,
                               n_jobs=-1).fit(model_input, y).best_params_
    best_model = DT(**best_params)
    return best_model
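A minimal usage sketch for the helper above; the grid values are illustrative and `DT` is assumed to be sklearn's DecisionTreeRegressor:

params_grid = {'min_samples_split': [2, 5, 10, 20]}  # hypothetical grid
best_model = _find_best_model(x, y, z, params_grid, test_size=0.2)
best_model.fit(_mix_merge_columns(x, z), y)  # the returned model is unfitted

Example 3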
    def trace_distribution(self, features_after, labels, u):
        ScoreDT = []
        DToriginal = []
        sampledfeatures = features_after.sample(u)
        index = sampledfeatures.index.tolist()

        combi = list(combinations(self.lis, 5))
        pl.figure(facecolor='white')
        for x in combi:
            cols = list(x)  # combinations() yields tuples; .loc expects a list
            # two independent copies: one to be shuffled, one kept as the original
            features = sampledfeatures.loc[:, cols]
            # print(features)
            features_origin = sampledfeatures.loc[:, cols]
            # print(features_origin)
            features = features.reset_index(drop=True)
            features.loc[:, cols] = Ratio().shuffle(features.loc[:, cols])
            label = []
            for n in index:
                label.append(labels[n])

            clfTestDT = cross_val_score(DT(min_samples_split=5,
                                           random_state=2),
                                        features.values,
                                        label,
                                        cv=5).mean()
            ScoreDT.append(clfTestDT)

            clfTestDTorigin = cross_val_score(DT(min_samples_split=5,
                                                 random_state=2),
                                              features_origin.values,
                                              label,
                                              cv=5).mean()
            DToriginal.append(clfTestDTorigin)

        h = sorted(ScoreDT)
        fit = stats.norm.pdf(h, np.mean(h), np.std(h))
        # pl.plot(h, fit)  # , label='Surrogates: mean=%0.2f' % np.mean(h))
        # `normed` was removed in Matplotlib 3.x; `density` is the replacement
        pl.hist(h, density=True, label='Surrogates: mean=%0.2f' % np.mean(h))

        v = sorted(DToriginal)
        fit1 = stats.norm.pdf(v, np.mean(v), np.std(v))
        # pl.plot(v, fit1)  # '-o', label='Real data: mean=%0.2f' % np.mean(v))
        pl.hist(v, density=True, label='Real data: mean=%0.2f' % np.mean(v))
        pl.legend(bbox_to_anchor=(0., -0.12, 1., .102),
                  loc=3,
                  ncol=2,
                  mode="expand",
                  borderaxespad=0.)
        pl.title(
            'Surrogate data testing for %s random uniform samples and 5 features'
            % u)
        pl.show()
        print(np.mean(h), np.std(h))
        print(np.mean(v), np.std(v))
Example 4
 def __init__(self, model_name='KNN', params=None):
     # use None instead of a mutable default argument
     if params is None:
         params = []
     if model_name == 'KNN':
         self.clf = KNN(n_neighbors=params[0]) if params else KNN()
     elif model_name == 'DT':
         self.clf = DT(max_depth=params[0]) if params else DT()
     else:
         self.clf = SVC()
     iris = datasets.load_iris()
     self.X = iris.data
     self.y = iris.target
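Example 5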
def train(X, y, args):
    print('start...')
    stime = time.time()
    clf = DT(random_state=10)
    clf.fit(X, y)

    return clf
Example 6
def DTpredictor(X_train, y_train, X_test):
    '''Decision Tree Classifier.
    Input: training data, target, and test data.
    Output: predicted labels for the test data and the mean CV accuracy.'''
    from sklearn.tree import DecisionTreeClassifier as DT
    from sklearn.model_selection import StratifiedShuffleSplit as SSS

    # cross validation using StratifiedShuffleSplit
    sss = SSS(n_splits=5, test_size=0.2, random_state=0)
    sss.get_n_splits(X_train, y_train)
    accuracy, logLoss, count = 0, 0, 0
    for train_ind, test_ind in sss.split(X_train, y_train):
        Xtrain, Xtest = X_train.iloc[train_ind], X_train.iloc[test_ind]
        ytrain, ytest = y_train[train_ind], y_train[test_ind]
        model = DT(random_state=1)
        model.fit(Xtrain, ytrain)
        y_pred = model.predict(Xtest)
        accuracy += metrics.accuracy_score(ytest, y_pred)
        # log_loss on hard predictions is a coarse proxy; predict_proba
        # would give a proper probabilistic score
        logLoss += metrics.log_loss(ytest, y_pred)
        count += 1

    # predict the held-out test data with the model from the last CV fold
    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    accModels[modelName] = accuracy / count
    predictions[modelName] = y_pred

    return y_pred, accuracy / count
Example 7
def train(xFile, yFile):
    with open(xFile, "rb") as file_r:
        X = pickle.load(file_r)
    X = reshape(X, (212841, -1))  # reshape to (212841, 30*128)

    # read the label data and encode it as integers
    with open(yFile, "r") as yFile_r:
        labelLines = [_.strip("\n") for _ in yFile_r.readlines()]
    values = array(labelLines)
    labelEncoder = LabelEncoder()
    integerEncoded = labelEncoder.fit_transform(values)
    integerEncoded = integerEncoded.reshape(len(integerEncoded), 1)
    # print(integerEncoded)

    # flatten the integer-encoded labels back to a 1-D array
    Y = integerEncoded.reshape(212841, )
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.3,
                                                        random_state=42)

    # decision tree classifier
    clf = DT(criterion="entropy", splitter="random")
    # criterion may be "gini" or "entropy": the former uses Gini impurity
    # (i.e. CART), the latter information gain. The default "gini" is usually
    # fine, unless you prefer ID3/C4.5-style optimal feature selection.
    # splitter may be "best" or "random": "best" searches all split points
    # for the optimum, "random" picks a locally optimal split from a random
    # subset. The default "best" suits modest sample sizes; for very large
    # datasets "random" is recommended.

    clf.fit(X_train, Y_train)

    # evaluate on the test data
    predict = clf.predict(X_test)
    count = 0
    for p, t in zip(predict, Y_test):
        if p == t:
            count += 1
    print("Decision tree Accuracy is:", count / len(Y_test))
Example 8
    def iterate(self):
        print('-' * 80)
        print('Running RUSBoost Iterations...')
        # performance by number of estimators and RUS ratio
        results = []

        for ne in self.n_estimators_conf:
            for rr in self.rus_ratio_conf:
                print('Iteration: nestimators=%s, rus_ratio=%s' % (str(ne), str(rr)))
                m = rusBoost(base_learner=DT(max_depth=2), n_estimators=ne, rus_ratio=rr,
                             class_numsamples_dict=self.class_numsamples_dict)
                m.fit(self.xtrain, self.ytrain)
                predtrain = m.predict(self.xtrain)
                predtest = m.predict(self.xtest)
                predprobatrain = m.predict_proba(self.xtrain)
                predprobatest = m.predict_proba(self.xtest)
                accuracytrain = metrics.accuracy_score(predtrain, self.ytrain)
                accuracytest = metrics.accuracy_score(predtest, self.ytest)
                # despite the 'ks' names, these hold multiclass log-loss values
                kstrain = multiclass_log_loss(self.ytrain, predprobatrain)
                kstest = multiclass_log_loss(self.ytest, predprobatest)
                cr = self.convert_cr(metrics.classification_report(self.ytest, predtest))
                results.append([ne, rr, accuracytrain, accuracytest, kstrain, kstest, cr])

        self.results = pd.DataFrame(results)
        self.results.columns = ['ne', 'rr', 'accuracy_train', 'accuracy_test',
                                'ks_train', 'ks_test', 'cr']
Example 9
def stat_on_train(model, train_set, val_set, is_using_val_set=True):
    """
    train a model with the train set and test on the validation set, 
    return the test results and model.
    :param str model: the classification model (DT, NB or KNN)
    :param list train_set: the training set instances
    :param list val_set: the validation set instances
    :param boolean is_using_val_set: if is_using_val_set is True, 
    the method will train the model using all the instances in 
    the training and validation set, and return the model; otherwise 
    it will just use the instances in the training set. 
    """
    if model == "DT":
        model = DT()
    elif model == "KNN":
        model = KNN()
    elif model == "NB":
        model = NB()
    else:
        exit()
    xtrain = np.array([[float(i) for i in v[:-1]] for v in train_set])
    ytrain = np.array([v[-1] for v in train_set])
    xtest = np.array([[float(i) for i in v[:-1]] for v in val_set])
    ytest = np.array([v[-1] for v in val_set])
    clf = model.fit(xtrain, ytrain)
    ypred = clf.predict(xtest)
    if is_using_val_set:
        clf = model.fit(np.concatenate((xtrain, xtest), axis=0), np.concatenate((ytrain, ytest), axis=0))
    return get_stat(ytest, ypred), clf
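A minimal call sketch, assuming each instance is a list whose last element is the class label (`train_rows` and `val_rows` are hypothetical names):

stats_, clf = stat_on_train("DT", train_rows, val_rows, is_using_val_set=False)

Example 10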
def fit_model(features, prices):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """

    # Create cross-validation sets from the training data: how to split,
    # and how many shuffled train/test runs to perform on the data
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.2, random_state=1)

    # Create a decision tree regressor object
    regressor = DT()

    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric, greater_is_better=True)

    # Create the grid search object--RandomizedSearchCV is another option
    grid = GridSearchCV(regressor,
                        param_grid=params,
                        scoring=scoring_fnc,
                        cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(features, prices)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
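A usage sketch, assuming `features` and `prices` are array-likes and `performance_metric` is a greater-is-better scorer as in the snippet:

reg = fit_model(features, prices)
print("Optimal max_depth:", reg.get_params()['max_depth'])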
Example 11
def main():
    df = pd.read_csv('data.csv', index_col='id')
    df = my_preprocessing(df)

    data_X, data_y = df.drop('y', axis=1), df['y']
    
    # NOTE: this hold-out split is not used below; both grid searches are fit on the full data
    train_X, test_X, train_y, test_y = train_test_split(data_X, data_y, test_size=0.25, random_state=0)

    tree = DT(random_state=0)
    parameters = {'max_depth':np.arange(2,11)}
    gcv = GridSearchCV(tree, parameters, cv=5, scoring='roc_auc', return_train_score=True, n_jobs=-1)
    gcv.fit(data_X, data_y)
    print(gcv.best_params_)
    best_tree = gcv.best_estimator_

    forest = RandomForestClassifier(random_state=0)
    parameters = {'max_depth':np.arange(2,11)}
    gcv = GridSearchCV(forest, parameters, cv=5, scoring='roc_auc', return_train_score=True, n_jobs=-1)
    gcv.fit(data_X, data_y)
    print(gcv.best_params_)
    best_forest = gcv.best_estimator_

    test_data = pd.read_csv('test.csv', index_col='id')
    test_data = my_preprocessing(test_data)

    pred = best_forest.predict_proba(test_data)[:, 1]

    submit = pd.read_csv('../input/sample_submission.csv', header=None)
    submit[1] = pred
    submit.to_csv('submit.csv', index=False, header=False)
Example 12
def tune_params(feature_count):
    X_train, X_test, y_train, y_test = get_data(feature_count, 2)

    # model params
    params = {
        "criterion": ["mse", "mae"],  # use entropy
        "splitter": ["best", "random"],
        "max_depth": range(2, 21),
        "min_samples_split": range(2, 21),
        "min_samples_leaf": range(1, 21),
        "min_impurity_decrease": [0.0, 0.05, 0.1, 0.15, 0.2, 0.25]
    }

    # run randomized search
    n_iter_search = 60
    clf = RandomizedSearchCV(DT(),
                             param_distributions=params,
                             n_iter=n_iter_search,
                             n_jobs=-1)
    clf.fit(X_train, y_train)
    r2 = clf.score(X_test, y_test)

    print "\tBest result from Tunning: %d features, score of %.5f" % (
        X_train.shape[-1], r2)
    print clf.best_params_
Example 13
 def fit(self, X, y):
     X_train, X_val, y_train, y_val = train_test_split(X,
                                                       y,
                                                       test_size=0.25,
                                                       random_state=0)
     train_predict, val_predict = 0, 0
     # initialise with the log-odds formula for the binary class proportions
     fit_val = np.log(y_train.mean() / (1 - y_train.mean()))
     next_fit_val = np.full(X_train.shape[0], fit_val)
     last_val_score = -np.infty
     for i in range(self.n_estimator):
         cur_booster = DT(max_depth=self.max_depth)
         cur_booster.fit(X_train, next_fit_val)
         train_predict += cur_booster.predict(X_train) * self.lr
         val_predict += cur_booster.predict(X_val) * self.lr
         next_fit_val = y_train - np.exp(train_predict) / (
             1 + np.exp(train_predict))
         self.booster.append(cur_booster)
         cur_val_score = self.record_score(y_train, y_val, train_predict,
                                           val_predict, i)
         if cur_val_score < last_val_score:
             self.best_round = i
             print("\n训练结束!最佳轮数为%d" % (i + 1))
             break
         last_val_score = cur_val_score
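The fit above only stores the trained boosters; a matching predict_proba sketch, assuming nothing beyond the attributes used there (`self.booster`, `self.lr`):

 def predict_proba(self, X):
     # accumulate each booster's contribution to the log-odds score
     score = 0
     for booster in self.booster:
         score = score + booster.predict(X) * self.lr
     # the same logistic transform used for the residuals in fit
     return np.exp(score) / (1 + np.exp(score))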
Example 14
 def fit(self, X, y):
     # split the dataset into training and validation sets
     X_train, X_val, y_train, y_val = train_test_split(X,
                                                       y,
                                                       test_size=0.25,
                                                       random_state=0)
     train_predict, val_predict = 0, 0
     next_fit_val = np.full(X_train.shape[0], np.mean(y_train))
     # bookkeeping for early stopping
     last_val_score = np.infty
     for i in range(self.n_estimator):
         cur_booster = DT(max_depth=self.max_depth)
         cur_booster.fit(X_train, next_fit_val)
         train_predict += cur_booster.predict(X_train) * self.lr
         val_predict += cur_booster.predict(X_val) * self.lr
         # The squared loss is ((y - (F_{m-1} + w))^2) / 2; writing the
         # residual as r, this equals ((r - w)^2) / 2, whose negative
         # gradient with respect to w at w = 0 is exactly r.
         # Hence the next fitting target is y_train - train_predict.
         next_fit_val = y_train - train_predict
         self.booster.append(cur_booster)
         cur_val_score = self.record_score(y_train, y_val, train_predict,
                                           val_predict, i)
         if cur_val_score > last_val_score:
             self.best_round = i
             print("\n训练结束!最佳轮数为%d" % (i + 1))
             break
         last_val_score = cur_val_score
Example 15
 def fit(self, X, y):
     self.model_list = []
     df = pd.DataFrame(X); df['label'] = y
     # identify the majority and minority classes
     if len(df[df['label']==0]) > len(df[df['label']==1]):
         df_maj = df[df['label']==0]; n_maj = len(df_maj)
         df_min = df[df['label']==1]; n_min = len(df_min)
     else:
         df_maj = df[df['label']==1]; n_maj = len(df_maj)
         df_min = df[df['label']==0]; n_min = len(df_min)

     cols = df.columns.tolist(); cols.remove('label')
     for ibagging in range(self.n_estimators):
         b = min(0.1 * ((ibagging % 10) + 1), 1)
         train_maj = df_maj.sample(frac=b, replace=True)
         train_min = df_min.sample(frac=b, replace=True)
         # train_maj = df_maj.sample(frac=1/self.n_estimators, replace=True)
         # train_min = df_min.sample(frac=1/self.n_estimators, replace=True)
         # train_maj = df_maj.sample(n=n_min, replace=True)
         # train_min = df_min.sample(frac=1/self.n_estimators, replace=True)
         # DataFrame.append() was removed in pandas 2.x; use pd.concat instead
         df_k = pd.concat([train_maj, train_min])
         X_train, y_train = SMOTE_IMB(k_neighbors=min(5, len(train_min)-1)).fit_resample(df_k[cols], df_k['label'])
         # print('Bagging Iter: {} |b: {:.1f}|n_train: {}|n_smote: {}'.format(
         #     ibagging, b, len(y_train), len(y_train)-len(df_k)))
         model = DT().fit(X_train, y_train)
         self.model_list.append(model)
     return self
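fit returns self but no prediction method is shown; a minimal companion sketch, assuming only the `self.model_list` attribute built above:

 def predict_proba(self, X):
     # average the probability estimates of all bagged trees
     probas = [model.predict_proba(X) for model in self.model_list]
     return np.mean(probas, axis=0)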
Example 16
    def fit(self, user_i):
        DTs = []
        s = str(user_i) + '_'
        with open("out", 'a') as standardout:
            print("[Fitting]\n", file=standardout)

        for i in range(self.n_trees):
            # bootstrap sample of rows and a random subset of feature columns
            data_indices = np.random.randint(0, self.X.shape[0],
                                             self.X.shape[0])
            feature_indices = np.random.randint(
                0, self.X.shape[1],
                np.random.randint(1, self.X.shape[1], 1)[0])
            temp_d = DT(criterion='entropy')
            # index rows first, then columns, to get an (n_rows, n_features)
            # slice; X[rows, cols] would try to broadcast the index arrays
            temp_d.fit(self.X[data_indices][:, feature_indices],
                       self.y[data_indices])
            DTs.append((temp_d, feature_indices))
        dts = []
        for i in range(len(DTs)):
            t_file_name = s + str(i) + '.pkl'
            # a context manager closes the file handle after dumping
            with open('/dev/core/files/' + t_file_name, 'wb') as d_temp_file:
                pickle.dump(DTs[i], d_temp_file)
            dts.append(t_file_name)
        return dts
Example 17
 def __init__(self, base_estimator=DT(), n_estimators=10, random_seed=None):
     self.base_estimator = base_estimator
     self.n_estimators = n_estimators
     self.random_seed = random_seed
     self.model_list = []
     # Will be set in the fit function
     self.feature_cols = None
Example 18
def train_valid_dt(source1, source2):
    """ Decision tree; this is the code that is actually used. """
    X_train, X_test, y_train, y_test = getData(const.DATAPATH, source1)
    print('starting...')
    stime = time.time()
    clf = DT(random_state=10)
    clf.fit(X_train, y_train)

    tree_text = export_text(clf,
                            feature_names=X_train.columns.values.tolist(),
                            max_depth=20)
    print('Tree Structure : ')
    print(tree_text)

    with open(os.path.join(const.DATAPATH,
                           'dt_structure_{}.txt'.format(source1)),
              'w',
              encoding='utf-8',
              errors='ignore') as f:
        f.write(tree_text)

    print('Feature importance : ')
    print(clf.feature_importances_)
    print('Time cost {:.2f} ||| Score={:.4f}'.format(
        (time.time() - stime) / 60, clf.score(X_test, y_test)))

    valid(clf, const.DATAPATH, source2)

    return clf
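Example 19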
 def setUp(self):
     self.tmp_fn = 'Tmp'
     self.iris = load_iris()
     self.n_features = len(self.iris.data[0])
     base_estimator = DT(max_depth=4, random_state=0)
     self.clf = ADA(base_estimator=base_estimator, n_estimators=100, random_state=0)
     self.clf.fit(self.iris.data, self.iris.target)
Example 20
    def iterate(self):
        print('-' * 80)
        print('Running SMOTEBoost Iterations...')
        # performance by number of estimators and SMOTE ratio
        results = []

        for ne in self.n_estimators_conf:
            for sr in self.smote_ratio_conf:
                print('Iteration: nestimators=%s, smote_ratio=%s' % (str(ne), str(sr)))
                # class_numsamples_dict and df_smote are taken from the
                # instance, as in the RUSBoost variant above
                m = SB(base_learner=DT(max_depth=2), n_estimators=ne, smote_ratio=sr,
                       class_numsamples_dict=self.class_numsamples_dict,
                       df_smote=self.df_smote)
                m.fit(self.xtrain, self.ytrain)
                predtrain = m.predict(self.xtrain)
                predtest = m.predict(self.xtest)
                predprobatrain = m.predict_proba(self.xtrain)
                predprobatest = m.predict_proba(self.xtest)
                accuracytrain = metrics.accuracy_score(predtrain, self.ytrain)
                accuracytest = metrics.accuracy_score(predtest, self.ytest)
                kstrain = multiclass_log_loss(self.ytrain, predprobatrain)
                kstest = multiclass_log_loss(self.ytest, predprobatest)
                results.append([ne, sr, accuracytrain, accuracytest, kstrain, kstest])

        self.results = pd.DataFrame(results)
        self.results.columns = ['ne', 'sr', 'accuracy_train', 'accuracy_test',
                                'ks_train', 'ks_test']
Example 21
	def __init__(self, base_learner=DT(max_depth=2),
				 n_estimators=3,
				 rus_ratio=1.0,
				 class_numsamples_dict=False):
		self.m = base_learner
		self.T = n_estimators
		self.rus_ratio = rus_ratio
		self.class_numsamples_dict = class_numsamples_dict
Example 22
    def fit(x, y, max_depth=5):
        classifier = DTClassifier()
        classifier.max_depth = max_depth

        classifier.clf = DT(max_depth=classifier.max_depth)

        classifier.clf.fit(x, y)
        return classifier
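Example 23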
 def NLMmodelexp1():
     modelExperiment(
         nlmInsampleData, nlmOutsampleData, 'NLMdata/', fullFV,
         [LR(), DT(), KNC(), RF(),
          ABC(), GNB(), QDA()], [
              'LogisticRegression', 'DTree', 'KNN', 'RandomForest',
              'AdaBoosted', 'GaussianNB', 'QuadraticDiscriminantAnalysis'
          ], 'NLMmodelExperiment1.csv', 'NLMclassifier_plot1.png', True)
Example 24
	def __init__(self,
				 base_learner=DT(max_depth=2),
				 n_estimators=3,
				 synthetic_data=None,
				 synthetic_ratio=1.0):
		self.m = base_learner
		self.T = n_estimators
		self.df_synthetic = synthetic_data
		self.synthetic_ratio = synthetic_ratio
Example 25
def tree(labels, X, df, i):
    # use a local name that does not shadow the function itself
    dt = DT(max_depth=4)
    dt.fit(X, labels)
    impt = dt.feature_importances_
    para = dt.get_params()
    export_graphviz(dt,
                    out_file=OUTPUT_DIRECTORY + str(i) + "_tree.dot",
                    feature_names=df.columns)
    return impt
Example 26
 def __init__(self, task='spam'):
     super(TaskTrainer, self).__init__()
     from sklearn.svm import SVC as SVM
     self.task = task
     if task == 'vehicle':
         self.env = SVM(C=1e2, kernel='rbf', random_state=0) # For vehicle task
     elif task == 'page':
         self.env = SVM(C=1e2, kernel='rbf', random_state=0, gamma=1e-2) # For page blocks
     elif task == 'credit':
         self.env = DT(max_depth=4) # For credit card task
     elif task == 'spam':
         self.env = LogisticRegression(C=1e2, random_state=0) # For spam detection task
Example 27
def DT_classif():
    # Decision Tree
    # http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
    # sklearn.tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None,
    #     min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None,
    #     random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
    #     class_weight=None, presort=False)
    hypers = {
        'max_depth': 5,
        'class_weight': 'balanced',
    }
    return DT(**hypers)
Example 28
	def __init__(self, base_learner=DT(max_depth=2),
				 n_estimators=3,
				 smote_ratio=10,
				 class_numsamples_dict=False,
				 df_smote=False,
				 smote_decay='linear'):
		self.m = base_learner
		self.T = n_estimators
		self.smote_ratio = smote_ratio
		self.class_numsamples_dict = class_numsamples_dict
		self.df_smote = df_smote
		self.smote_decay = smote_decay
Example 29
    def fit(self, user_i):
        DTs = []
        s = str(user_i) + '_'
        z = time.time()
        with open("out", 'a') as standardout:
            print("[Fitting]", file=standardout)

        for i in range(self.n_trees):
            data_indeces = np.random.randint(0, self.X.shape[0],
                                             self.X.shape[0])
            y_indeces = np.random.randint(
                0, self.X.shape[1],
                np.random.randint(1, self.X.shape[1], 1)[0])
            temp_d = DT(criterion='entropy')
            # index rows first, then columns, to get an (n_rows, n_features)
            # slice; X[rows, cols] would try to broadcast the index arrays
            temp_d.fit(self.X[data_indeces][:, y_indeces],
                       self.y[data_indeces])
            DTs.append((temp_d, y_indeces))
        dts = []
        #filestart = time.time()
        #q = Queue()
        #proc = []
        for i in range(len(DTs)):
            t_file_name = s + str(i) + '.pkl'
            # a context manager closes the file handle after dumping
            with open(t_file_name, 'wb') as d_temp_file:
                pickle.dump(DTs[i], d_temp_file)
            dts.append(t_file_name)
            #p = Process(target=file_dumper,args=(q,t_file_name,DTs[i]))
            #p.start()
            #proc.append(p)
        pickled = []
        for i in dts:
            # the pickles were written in binary mode, so read as 'rb'
            with open(i, 'rb') as pklfile:
                pickled.append(pklfile.read())
        '''    
        for i in range(len(DTs)):
             proc[i].join()
             dts.append(q.get()) ####DANGER
             #d_temp_file.close()
        
        filestop = time.time()
        v = time.time()
        with open("out",'a') as standardout:
            print("FIT TIME",v-z,file=standardout)
        with open("out",'a') as standardout:
            print("FIT COMPLETE",file=standardout)
        #return DTs
        '''
        return dts, pickled
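Example 30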
 def SOmodelexp1():
     modelExperiment(
         SOInsampleData, SOOutsampleData, 'stackoverflowdata/', fullFV,
         [LR(),
          DT(),
          KNC(),
          RF(n_estimators=200),
          ABC(),
          GNB(),
          QDA()], [
              'LogisticRegression', 'DTree', 'KNN', 'RandomForest',
              'AdaBoosted', 'GaussianNB', 'QuadraticDiscriminantAnalysis'
          ], 'SOmodelExperiment1.csv', 'SOclassifier_plot1.png', True)