Code example #1
def test_sample_regular_wrong_svm():
    kind = 'svm'
    nn_k = NearestNeighbors(n_neighbors=6)
    svm = 'rnd'
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm)

    with raises(ValueError, match="has to be one of"):
        smote.fit_sample(X, Y)
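Note on versions: most of the snippets in this listing were written against older imbalanced-learn releases, where SMOTE took kind= and ratio= arguments and resampling was done with fit_sample(). In current imbalanced-learn the variants live in separate classes and the method is fit_resample(). A minimal sketch of the rough equivalents, assuming imbalanced-learn >= 0.6 and placeholder arrays X, y:

from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE

# kind='regular'      -> plain SMOTE
X_res, y_res = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=0).fit_resample(X, y)
# kind='borderline1'  -> BorderlineSMOTE (kind='borderline-1' or 'borderline-2')
X_res, y_res = BorderlineSMOTE(kind='borderline-1', random_state=0).fit_resample(X, y)
# kind='svm'          -> SVMSMOTE (svm_estimator is still a parameter there)
X_res, y_res = SVMSMOTE(random_state=0).fit_resample(X, y)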
Code example #2
def test_fit_sample_nn_obj():
    """Test sample with NN object provided."""

    # Create the object
    kind = 'borderline1'
    nn_m = NearestNeighbors(n_neighbors=11)
    nn_k = NearestNeighbors(n_neighbors=6)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.3765279, -0.2009615], [0.55276636, -0.10550373],
                     [0.45413452, -0.08883319], [1.21118683, -0.22817957]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Code example #3
def test_sample_regular_with_nn_svm():
    """Test sample function with regular SMOTE with a NN object."""

    # Create the object
    kind = 'svm'
    nn_k = NearestNeighbors(n_neighbors=6)
    svm = SVC(random_state=RND_SEED)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.47436888, -0.2645749], [1.07844561, -0.19435291],
                     [1.44015515, -1.30621303]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Code example #4
    def fit(self, X, y=None):
        # 'Random under-sampling'
        # CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
        #Accuracy: 0.939693267481
        #Precision: 0.238095238095
        #Recall: 0.897435897436

        #Accuracy: 0.962568234988
        #Precision: 0.324468085106
        #Recall: 0.782051282051
        #SMOTE(ratio=ratio, kind='borderline1')
        #Accuracy: 0.971146347803
        #Precision: 0.372093023256
        #Recall: 0.615384615385
        #SMOTE(ratio=ratio, kind='borderline2')
        #Accuracy: 0.965427605927
        #Precision: 0.333333333333
        #Recall: 0.705128205128
        #svm_args = {'class_weight': 'auto'}
        #svmsmote = SMOTE(ratio=ratio, kind='svm', **svm_args)
        #Accuracy: 0.972186119054
        #Precision: 0.395683453237
        #Recall: 0.705128205128

        smote = SMOTE(ratio='auto', kind='regular')
        X, y = smote.fit_sample(X, y)
        # weights = np.array([1/y.mean() if i == 1 else 1 for i in y])
        return super(RandomForestClassifier, self).fit(X, y)  # , sample_weight=weights)
Code example #5
def train(addr_train, clf, sampling, add_estimators):
    with open(os.path.join(addr_train, "day_samp_bin.npy"), "r") as file_in:
        X = smio.load_sparse_csr(file_in)
    width = np.size(X, 1)
    X_train = X[:, :width-1]
    y_train = X[:, width-1]
    if sampling == "Over":
        sm = SMOTE(ratio=0.95)
        X_train, y_train = sm.fit_sample(X_train, y_train)
    elif sampling == "Under":
        X_train, y_train = US.undersample(X, 0.01)

    print "Fitting Model......"
    clf.n_estimators += add_estimators
    clf.fit(X_train, y_train)
    print "Done"

    if __SAVE_MODEL:
        model_name = "RF_" + onoff_line + "_" + sampling + "_Model.p"
        dir_out = os.path.join(addr_train, "Random_Forest_Models")
        if not os.path.isdir(dir_out):
            os.mkdir(dir_out)
        path_out = os.path.join(dir_out, model_name)
        with open(path_out, "w") as file_out:
            pickle.dump(clf, file_out)

    return clf
Code example #6
def test_sample_borderline2():
    """Test sample function with borderline 2 SMOTE."""

    # Create the object
    kind = 'borderline2'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.47436888, -0.2645749], [1.07844561, -0.19435291],
                     [0.33339622, 0.49870937]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Code example #7
File: Bernoulli_Simple.py  Project: Shurooo/gumgum
def get_data(month, day, hour=-1, mode="normal"):
    if hour != -1:
        if hour == 24:
            hour = 0
            day += 1
        addr_in = os.path.join("/mnt/rips2/2016",
                               str(month).rjust(2, "0"),
                               str(day).rjust(2, "0"),
                               str(hour).rjust(2, "0"),
                               "output_bin.npy")
    else:
        addr_in = os.path.join("/mnt/rips2/2016",
                               str(month).rjust(2, "0"),
                               str(day).rjust(2, "0"),
                               "day_samp_newer_bin.npy")
    with open(addr_in, "r") as file_in:
        loader = np.load(file_in)
        data = csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape']).toarray()
    X = data[:, :-1]
    y = data[:, -1]

    if mode == "over":
        sm = SMOTE(ratio=0.99, verbose=0)
        X, y = sm.fit_sample(X, y)

    return X, y
Code example #8
def resample_data(X, y, categorical_lst):
    '''
    up-samples minority class
    '''
    sm = SMOTE(kind='regular')
    X_train_re, y_train_re = sm.fit_sample(X,y)
    #rounding categorical variables
    X_train_re[:,categorical_lst] = np.round(X_train_re[:,categorical_lst])
    return X_train_re, y_train_re
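A possible alternative to rounding the categorical columns by hand (my sketch, not part of the original snippet): newer imbalanced-learn releases also provide SMOTENC, which handles categorical features natively. Assuming the same categorical_lst of column indices:

from imblearn.over_sampling import SMOTENC

def resample_data_nc(X, y, categorical_lst):
    '''up-samples the minority class without rounding categorical columns afterwards'''
    sm = SMOTENC(categorical_features=categorical_lst, random_state=0)
    return sm.fit_resample(X, y)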
Code example #9
def Input_Preparing(Scaled_Input_Data, Surgery_Outcome, N_Feat):
    # Feature Selection
    MIFS = mifs.MutualInformationFeatureSelector(method='JMI', verbose=2, n_features = N_Feat)
    MIFS.fit(Scaled_Input_Data, Surgery_Outcome)
    Selected_Input_Data = Scaled_Input_Data.loc[:,MIFS.support_]

    # Balancing using SMOTE
    sm = SMOTE(kind='regular')
    Prep_Train_Data, Prep_Surgery_Outcome = sm.fit_sample(Selected_Input_Data, Surgery_Outcome)
    
    return(Prep_Train_Data, Prep_Surgery_Outcome, MIFS.support_)
Code example #10
def SMT(df, target):
    df1 = df.copy()
    y = df1.pop('anti_churn')
    X = df1
    Xcols = df1.columns
    sm = SMOTE(kind='regular', ratio = target)
    X_resampled, y_resampled = sm.fit_sample(X, y)
    X_resampled = pd.DataFrame(X_resampled)
    y_resampled = pd.DataFrame(y_resampled)
    X_resampled.columns = Xcols
    y_resampled.columns = ['anti_churn']
    return X_resampled, y_resampled
Code example #11
File: transform.py  Project: alexeyche/alexeyche-junk
    def transform(self, fp):
        fm, train_x, train_y = FeaturePool.to_train_arrays(fp)

        os = SMOTE(random_state = self.random_state)
        os_train_x, os_train_y = os.fit_sample(train_x, train_y[:, 0])
        os_train_y = os_train_y.reshape((os_train_y.shape[0], 1))

        for f in FeaturePool.from_train_arrays(fm, os_train_x, os_train_y):
            yield Feature.apply_config(f, is_over_sampled=True)
        for f in fp:
            if f.split_type == SplitType.TEST:
                yield f
Code example #12
    def oversample(self):
        """Balance class data based on outcome"""
        print('Current outcome sampling {}'.format(Counter(self.y)))
        
        # to use a random sampling seed at random:
        #ros = RandomOverSampler()
        ros = SMOTE()
        #ros = ADASYN()

        self.X, self.y = ros.fit_sample(self.X, self.y)

        self.Xview = self.X.view()[:, :self.n_features]
        print('Resampled dataset shape {}'.format(Counter(self.y)))
Code example #13
File: oversample.py  Project: brettin/pilot1-docs
def oversample(X, y, bal_strategy):

	if(bal_strategy == "SMOTESVN"  or bal_strategy == "ALL"):
		# Apply SMOTE SVM
		sm = SMOTE(kind='svm')
		X_sampled, y_sampled = sm.fit_sample(X, y)

		print 'Shape of X_sampled: ', X_sampled.shape
		print 'Shape of y_sampled: ', y_sampled.shape

	elif(bal_strategy == "SMOTE"  or bal_strategy == "ALL"):
		# Apply regular SMOTE
		sm = SMOTE(kind='regular')
		X_sampled, y_sampled = sm.fit_sample(X, y)

		print 'Shape of X_sampled: ', X_sampled.shape
		print 'Shape of y_sampled: ', y_sampled.shape

	elif(bal_strategy == "ADASYN"  or bal_strategy == "ALL"):
	# Apply the random over-sampling
		ada = ADASYN()
		X_sampled, y_sampled = ada.fit_sample(X, y)

		print 'Shape of X_sampled: ', X_sampled.shape
		print 'Shape of y_sampled: ', y_sampled.shape

	elif(bal_strategy == 'NONE'):
		X_sampled = X
		y_sampled = y

		print 'Shape of X_sampled: ', X_sampled.shape
		print 'Shape of y_sampled: ', y_sampled.shape

	else:
		print 'bal_strategy not in SMOTESVN, SMOTE, ADASYN, ALL, NONE'
		sys.exit(1)


	return (X_sampled, y_sampled)
Code example #14
def test_sample_regular():
    """Test sample function with regular SMOTE."""

    # Create the object
    kind = 'regular'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Code example #15
File: Bernoulli_CorEx.py  Project: Shurooo/gumgum
def train(cutoffs):
    print "\n========== Start Training =========="
    if __DATA_FROM == 2:
        list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1])
    else:
        list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1])
    clf = BernoulliNB(class_prior=[0.05, 0.95])

    if __IF_TRAIN_WITHOUT_SAVE:
        print "Performing correlation explanation......"
        with open("/home/wlu/Desktop/day_samp_bin_1-2.npy", "r") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)
            if len(cutoffs) > 0:
                X = discard_vars(X, cutoffs)
            layer = correlation_ex(X)

    for i in range(0, len(list_io_addr)):
        path_in = list_io_addr[i]
        print "\nGenerating training set from {}".format(path_in)
        with open(path_in, "r") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)

        if len(cutoffs) > 0:
            X = discard_vars(X, cutoffs)

        vector_len = len(X[0])
        X_train = X[:, 0:vector_len-1]
        y_train = X[:, vector_len-1]

        if __IF_TRAIN_WITHOUT_SAVE:
            print "Transforming training set according to CorEx......"
            X_train = corex_transform(layer, X_train)

        sm = SMOTE(ratio=0.95)
        X_train, y_train = sm.fit_sample(X_train, y_train)

        print "Fitting Model......"
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print "Done"

    if __IF_TRAIN_WITHOUT_SAVE:
        return [clf, layer]
    else:
        with open(__ROOT_MODEL, "w") as file_out:
            pickle.dump(clf, file_out)
        return []
Code example #16
def get_data(ratio, sampling):
    list_io_addr = get_io_addr()
    data = []
    for addr_in in list_io_addr:
        with open(addr_in, "r") as file_in:
            X = smio.load_sparse_csr(file_in)
            data.extend(X)
    data = np.array(data)

    n = 30000
    if sampling == "Over":
        m = int(np.size(data, 1))
        k = int(0.8*n)
        X = data[:n, :m-1]
        y = data[:n, m-1:]
        X_train = X[:k, :]
        y_train = y[:k]
        sm = SMOTE(ratio=ratio)
        X_train, y_train = sm.fit_sample(X_train, column_or_1d(y_train, warn=False))
        X_test = X[k:, :]
        y_test = y[k:]
    elif sampling == "None":
        m = int(np.size(data, 1))
        k = int(0.8*n)
        X = data[:n, :m-1]
        y = data[:n, m-1:].ravel()
        X_train = X[:k, :]
        y_train = y[:k]
        X_test = X[k:, :]
        y_test = y[k:]
    else:
        m = int(np.size(data, 1))
        k = int(0.2*np.size(data, 0))
        data_test = data[k:, :]
        data = data[:k, :]
        data = US.undersample(data, ratio)
        k = int(0.8*np.size(data, 0))
        if np.size(data_test, 0) > k:
            data_test = data[:k, :]
        X_train = data[:, :m-1]
        y_train = data[:, m-1:].ravel()
        X_test = data_test[:, :m-1]
        y_test = data_test[:, m-1:].ravel()
    return X_train, y_train, X_test, y_test
Code example #17
def clf_extratree_predictor(item):
    (clf_args,idx,X,y,use_SMOTE) = item
    train_index, test_index = idx

    clf = sklearn.ensemble.ExtraTreesClassifier(**clf_args)
        
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    if use_SMOTE:
        sampler = SMOTE(ratio='auto', kind='regular')
        X_train, y_train = sampler.fit_sample(X_train,y_train)
    
    clf.fit(X_train,y_train)
    
    pred   = clf.predict(X_test)
    pred_proba = clf.predict_proba(X_test)

    return idx,pred,pred_proba
Code example #18
def train_and_test_dnn(args):
    
    for a in args:
        print(a)
    
    primitive = args[1]
    res =  pickle.load(open(sys.argv[2], "rb" ))
    notes_with_truth_labels_for_query_primitives = pd.read_csv(args[3])
   
    dl_results = pd.DataFrame(columns = ['primitive', 'avg_fit_time', 'avg_score_time', 'avg_test_score'])
    
    X = get_doc_term_matrix(res)
    y = notes_with_truth_labels_for_query_primitives.loc[:, primitive]

    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128, 5, 2), random_state=1)

    try:

        sm = SMOTE(random_state=357)
        X_sm, y_sm = sm.fit_sample(X, y)

    except ValueError:
        print("value error, smote")
        X_sm = X
        y_sm = y

    cv_results = cross_validate(clf, X_sm, y_sm, cv=3, return_train_score=False)
    print(cv_results)

    dump(clf, './models/{}_trained_dnn.joblib'.format(primitive)) 

    dl_results.loc[0, 'primitive'] = primitive
    dl_results.loc[0, 'avg_fit_time'] = np.mean(cv_results['fit_time'])
    dl_results.loc[0, 'avg_score_time'] = np.mean(cv_results['score_time'])
    dl_results.loc[0, 'avg_test_score'] = np.mean(cv_results['test_score'])

    with open(args[4], 'a') as f:
        f.write("{}, {}, {}, {}\n".format(dl_results.loc[0,'primitive'], dl_results.loc[0,'avg_fit_time'], dl_results.loc[0,'avg_score_time'], dl_results.loc[0,'avg_test_score']))
        #f.write(dl_results.loc[0,:])
        #f.write("\n")
        f.close()
    
    print("DONE w/ {}".format(primitive))
Code example #19
def test_sample_regular_half():
    ratio = {0: 9, 1: 12}
    kind = 'regular'
    smote = SMOTE(ratio=ratio, random_state=RND_SEED, kind=kind)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.36784496, -0.1953161]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Code example #20
def test_sample_borderline1():
    kind = 'borderline1'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.3765279, -0.2009615], [0.55276636, -0.10550373],
                     [0.45413452, -0.08883319], [1.21118683, -0.22817957]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Code example #21
def test_sample_regular_with_nn():
    kind = 'regular'
    nn_k = NearestNeighbors(n_neighbors=6)
    smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668], [0.66052536, -0.28246517]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Code example #22
File: LearningCurve.py  Project: Shurooo/gumgum
def DataFormat(data):
    Data = smio.load_sparse_csr(data)
    m = int(np.size(Data, 1))
    n = int(np.size(Data, 0))
    X_train = Data[:50000, :m-1]
    y_train = Data[:50000, m-1]
    sm = SMOTE(ratio=0.95)
    X_train, y_train = sm.fit_sample(X_train, y_train)

    data_new = []
    for i in range(np.size(X_train, 0)):
        row = list(X_train[i].tolist())
        row.append(y_train[i])
        data_new.append(row)
    shuffle(data_new)
    data_new = np.array(data_new)
    m = int(np.size(data_new, 1))
    X_train = data_new[:, :m-1]
    y_train = data_new[:, m-1]

    K = np.count_nonzero(y_train)   # Number of good data points
    return X_train, y_train, n, K   # Training set plus some numbers useful for weighting
Code example #23
File: Get_Data_Rodrigo.py  Project: Shurooo/gumgum
def get(addr_day, mode="normal", ratio=-1, sampling_method="None", bin=False):
    if "res" in mode:
        res_ratio = mode.split("-")[1]
        prefix = "day_samp_res"
        suffix = "_{}.npy".format(res_ratio)
        res = "Reservoir_Data"
    else:
        prefix = "day_samp_new"
        suffix = ".npy"
        res = ""

    if not ratio == -1:
        n = 100000
        neg = int(n / (1+ratio))
        pos = n - neg

        with open(os.path.join(addr_day, "PosNeg", res, prefix + "_neg" + suffix), "r") as file_neg:
            matrix_neg = smio.load_sparse_csr(file_neg)
        matrix_neg = matrix_neg[:neg, :]
        with open(os.path.join(addr_day, "PosNeg", res, prefix + "_pos" + suffix), "r") as file_pos:
            matrix_pos = smio.load_sparse_csr(file_pos)
        matrix_pos = matrix_pos[:pos, :]

        matrix = vstack((matrix_neg, matrix_pos))
        np.random.shuffle(matrix)
    else:
        with open(os.path.join(addr_day, res, prefix + suffix), "r") as file_in:
            matrix = smio.load_sparse_csr(file_in)

    width = np.size(matrix, 1)
    X = matrix[:, :width-1]
    y = matrix[:, width-1]

    if "Over" in sampling_method:
        sm = SMOTE(ratio=0.95)
        X, y = sm.fit_sample(X, y)

    return X, y
Code example #24
def test_wrong_nn():
    kind = 'borderline1'
    nn_m = 'rnd'
    nn_k = NearestNeighbors(n_neighbors=6)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m)
    with raises(ValueError, match="has to be one of"):
        smote.fit_sample(X, Y)
    nn_k = 'rnd'
    nn_m = NearestNeighbors(n_neighbors=10)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m)
    with raises(ValueError, match="has to be one of"):
        smote.fit_sample(X, Y)
    kind = 'regular'
    nn_k = 'rnd'
    smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k)
    with raises(ValueError, match="has to be one of"):
        smote.fit_sample(X, Y)
Code example #25
            Feature_test = np.concatenate(
                (Positive_Features_test, Negative_Features_test))
            Label_test = np.concatenate(
                (Positive_Labels_test, Negative_Labels_test))
            #                print(Label_test)

            clf = xgb.XGBClassifier()
            if m == "xGBoost":
                Feature_train = Features_train_o
                Label_train = Labels_train_o
                clf.fit(Feature_train, Label_train)
                Label_predict = clf.predict(Feature_test)
                Label_score = clf.predict_proba(Feature_test)
            elif m == "SMOTE":
                sm = SMOTE()
                Feature_train, Label_train = sm.fit_sample(
                    Features_train_o, Labels_train_o)
                clf.fit(Feature_train, Label_train)
                Label_predict = clf.predict(Feature_test)
                Label_score = clf.predict_proba(Feature_test)
            elif m == "Bayesian":
                bayes = BayesianNetwork.from_json(bayes_name)
                Negative_Features_train_prob = bayes.probability(
                    Negative_Features_train)
                Positive_Features_train_prob = np.zeros(
                    (Num_Positive_train, 1))
                for k in range(Num_Positive_train):
                    try:
                        Positive_Features_train_prob[k] = bayes.probability(
                            Positive_Features_train[k])
                    except KeyError:
                        Positive_Features_train_prob[k] = 0
Code example #26
##test train sets
train, test = train_test_split(full_normalized,
                               test_size=0.2,
                               random_state=123)

x_train = np.array(train.drop(
    'BK', axis=1))  #needed as arrays so that we can "ravel"
y_train = np.array(train.loc[:, ['BK']])

x_test = np.array(test.drop('BK', axis=1))
y_test = np.array(test.loc[:, ['BK']])

#Smote Data
sm = SMOTE(random_state=123)
x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train.ravel())

##--------------------------------------------------------------------------
#RandomForest
rfc = RandomForestClassifier(random_state=123)

#parameters
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}

#scoring
scoring = {
Code example #27
    # Read the data
    data = load_svmlight_file(args.datapath)

    # Set up the label vector and feature matrix
    X, y = data[0], data[1]
    print("\nDataset shape: ", X.shape, " Number of features: ", X.shape[1])
    # Per-class counts (based on the target column)
    num_categories = np.unique(y).size
    sum_y = np.asarray(np.unique(y.astype(int), return_counts=True))
    df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
    print('\n', df_sum_y)

    # Apply SMOTE to generate synthetic samples
    sm = SMOTE(k_neighbors=2)
    x_resampled, y_resampled = sm.fit_sample(X, y)
    # After oversampling, recount the class labels
    np_resampled_y = np.asarray(
        np.unique(y_resampled.astype(int), return_counts=True))
    df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
    print("\nNumber of samples after oversampling:\n{0}".format(
        df_resampled_y))

    # Initialize the classifier
    clf = SVC(kernel=args.kernel,
              gamma=args.gamma,
              C=args.c,
              max_iter=args.max_iter,
              random_state=args.randomseed)
    print("\nClassifier parameters:")
    print(clf.get_params())
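One caveat worth noting here (my note, not from the original script): SMOTE fits its nearest-neighbour search on the minority class only, so k_neighbors cannot exceed the minority class size minus one, which is presumably why k_neighbors=2 is used above. A small guard, with X and y as in the snippet:

import numpy as np
from imblearn.over_sampling import SMOTE

minority_count = np.unique(y.astype(int), return_counts=True)[1].min()
k = max(1, min(5, minority_count - 1))   # shrink k for very small minority classes
sm = SMOTE(k_neighbors=k)
x_resampled, y_resampled = sm.fit_sample(X, y)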
Code example #28
def smote(x, y):
    print("----SMOTE----")
    sampler = SMOTE(random_state=42)
    X, y = sampler.fit_sample(x, y)
    return X, y
Code example #29
    def fit(self, X, y=None):
        smote = SMOTE(ratio='auto', kind='regular')
        X, y = smote.fit_sample(X, y)
        # weights = np.array([1/y.mean() if i == 1 else 1 for i in y])
        return super(AdaBoostClassifier, self).fit(X, y)  # , sample_weight=weights)
Code example #30
#%%
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

import utils

#%%
# prepare data
x,y = utils.load_data_as_df('dataset/train.data')
smote = SMOTE(sampling_strategy=0.5, random_state=100)
x, y = smote.fit_sample(x, y)

#%%
# tuning parameters
# C multiplies the loss term, not the regularization term, so a larger C means a weaker penalty on the coefficients and the model overfits more easily

params = {'C':[0.1, 1, 5, 10, 13, 15, 20, 25, 30],
          'solver':['liblinear','sag','lbfgs','newton-cg']
         }

k = 5
clf = GridSearchCV(LogisticRegression(random_state=10, max_iter=1000), params, scoring='f1', n_jobs=-1, cv=k)
search = clf.fit(x, y)

results = search.cv_results_
print('best params', search.best_params_)
Code example #31
        cbar=False)
    plt.xlabel("true label")
    plt.ylabel("predicted label")
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)


plot_conf_mat(y_test, y_preds)

#Plot ROC curve and calculate Auc metric
plot_roc_curve(svm_model, test_matrix, y_test)

# applying SMOTE to balance the data
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train1, y_train1 = sm.fit_sample(train_matrix, y_train)
y_train1.value_counts()
y_train.value_counts()

svm_model = LinearSVC()
svm_model.fit(X_train1, y_train1)
train_pred_svc = svm_model.predict(X_train1)
accuracy_train_svc = np.mean(train_pred_svc == y_train1)

y_preds = svm_model.predict(test_matrix)
accuracy_test_svc = np.mean(y_preds == y_test)

print(classification_report(y_test, y_preds))
pd.crosstab(y_test, y_preds)

# Applying k-Fold Cross Validation
Code example #32
    'solver': ['lbfgs', 'sag'],
    'class_weight': ['balanced'],
    'penalty': ['l2'],
    'C': [.1, .01, .001, .001, .2, .02, .002],
    'multi_class': ['multinomial', 'auto']
}

gs = GridSearchCV(model, param_grid=lr_params)
gs.fit(train_data_tfid, y_train)

gs.best_params_

#Modeling (Smote and Regularization)

sm = SMOTE()
x_reb, y_reb = sm.fit_sample(train_data_tfid, y_train)

model = LogisticRegression(C=.001,
                           multi_class='multinomial',
                           penalty='l2',
                           solver='sag')
model.fit(x_reb, y_reb)

print(model.score(x_reb, y_reb))
print(model.score(test_data_tfid, y_test))

predictions = model.predict(test_data_tfid)

pd.DataFrame(predictions, y_test)

# Creating pipeline and pickleing
Code example #33
from preprocess.load_data.data_loader import load_production
production_tb = load_production()

# The book's example starts from the line below
# Load the SMOTE class from the library
from imblearn.over_sampling import SMOTE

# Configure SMOTE
# ratio sets how far the minority class of the imbalanced data is increased
# relative to the majority class ('auto' grows it to the same count;
# 0.5 grows it to 50% of the majority class)
# k_neighbors is SMOTE's k parameter
# random_state is the random seed (the source of the random number pattern)
sm = SMOTE(ratio='auto', k_neighbors=5, random_state=71)

# Run the oversampling
balance_data, balance_target = \
  sm.fit_sample(production_tb[['length', 'thickness']],
                production_tb['fault_flg'])
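A quick follow-up check (not part of the book's snippet) to confirm what ratio='auto' did, comparing the class counts before and after oversampling:

from collections import Counter

print(Counter(production_tb['fault_flg']))   # original, imbalanced counts
print(Counter(balance_target))               # after SMOTE both classes have the same count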
Code example #34
File: main_d.py  Project: tongzhang888/DSCI_552
                                 needImpute=True,
                                 dropOrNot=True)

    X_test, y_test = load_data(ROOT_PATH + APS_TEST,
                               skip_first_row=21,
                               y_column_index=0,
                               assignedColumnNames=APS_FULL_COLUMNS,
                               missingSymbol='na',
                               needImpute=True,
                               dropOrNot=True)

    y_test = to_binary_numeric(y_test, classNeg="neg")
    y_train = to_binary_numeric(y_train, classNeg="neg")

    smote = SMOTE(random_state=2333)
    smote_train_fit = smote.fit_sample(X_train, y_train)
    smote_test_fit = smote.fit_sample(X_test, y_test)
    X_train_smote = pd.DataFrame(smote_train_fit[0])
    y_train_smote = pd.DataFrame(smote_train_fit[1], columns=['class'])
    X_test_smote = pd.DataFrame(smote_test_fit[0])
    y_test_smote = pd.DataFrame(smote_test_fit[1], columns=['class'])

    print("-----------\"After Using SMOTE: (Train)\"-------------")
    print(y_train_smote['class'].value_counts())
    print("-----------\"After Using SMOTE: (Test)\"-------------")
    print(y_test_smote['class'].value_counts())

    randForestClf = RandomForestClassifier(n_estimators=50,
                                           random_state=2333,
                                           oob_score=True)
    randForestClf.fit(X_train_smote, y_train_smote)
Code example #35

#traindata=pd.get_dummies(train)
#x_train=traindata.drop('renewal',axis=1)
#y_train=traindata['renewal']

oversampler=SMOTE(random_state=0, ratio=1)

rocm=[1,2,3,4,5,6,7,8,9,10]
num_folds = 10
subset_size = math.floor(len(traindata)/num_folds)
for i in range(num_folds):
    print(i)
    training = traindata[:i*subset_size].append(traindata[(i+1)*subset_size:])
    train_data=training.drop(['id','renewal'], axis=1)
    sm_x,sm_y=oversampler.fit_sample(train_data,training['renewal'])
    
    #print(sm_y.sum()/len(sm_y))
    lgtrain=lgb.Dataset(sm_x,label=sm_y)
    clf = lgb.train(params, lgtrain, 700)

    
    testing = traindata[i*subset_size:][:subset_size]
    testd=testing.drop(['renewal','id'], axis=1)
    testd=pd.get_dummies(testd)
    
    lgbpred=clf.predict(testd)
    for x in range(0,len(lgbpred)):
        if lgbpred[x]>=.5:       # setting threshold to .5
           lgbpred[x]=1
        else:  
Code example #36
File: XGBoost.py  Project: HongqiaoChen/DataMining
from imblearn.over_sampling import SMOTE
from sklearn import model_selection
import xgboost
import numpy as np
from sklearn import metrics

data = pd.read_table('D:/test/14/creditcard.csv',sep=',')
X = data.drop({'Time','Class'},axis=1)
Y = data.Class
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,test_size=0.3,random_state=1234)

# The default rate is very low and the data is severely imbalanced, so rather than
# modeling on it directly, oversample with the SMOTE algorithm
counts = data.Class.value_counts()
print('Default rate: %f' % (counts[1]/(counts[0]+counts[1])))
over_sample = SMOTE(random_state=1234)
over_sample_X,over_sample_Y = over_sample.fit_sample(X_train,Y_train)

xgboost = xgboost.XGBClassifier()
xgboost.fit(over_sample_X,over_sample_Y)

xgboost_predict = xgboost.predict(np.array(X_test))
cm = pd.crosstab(xgboost_predict,Y_test)
print('Confusion matrix:')
print(cm)
print('Evaluation results based on the confusion matrix:')
print(metrics.classification_report(Y_test,xgboost_predict))

Y_score = xgboost.predict_proba(np.array(X_test))[:,1]
fpr, tpr,threshold =metrics.roc_curve(Y_test,Y_score)
roc_auc = metrics.auc(fpr,tpr)
print('AUC=%f'%roc_auc)
Code example #37
    try:
        games = pd.read_csv(lg_data_path)
        games = games.dropna(how='any')

        dc_columns = get_config(file="dc_columns/{}".format(league))
        played_data = games.loc[
            (games.Season.isin([1415, 1516, 1617, 1718, 1819]))
            & (games.played == 1)]

        target_1x = played_data.FTR.map({"D": 0, "A": 1, "H": 0})

        # Select significant columns
        dc_data = played_data[dc_columns]

        # Double chance model fit
        sm = SMOTE(random_state=2)
        dc_data_res, target_1x_res = sm.fit_sample(dc_data, target_1x.ravel())

        model = LogisticRegression(C=1e5)
        model.fit(dc_data_res, target_1x_res)
        log.info("0: '1x', 1: 'A' League: {}\t DC score: {}".format(
            league, model.score(dc_data_res, target_1x_res)))
        model_filename = get_analysis_root_path(
            "tools/league_models/{}_dc".format(league))
        joblib.dump(model, model_filename)

    except Exception as e:
        log.warn("New wdw model not built for {}".format(league).upper())
        log.warn("See why:::::: {}".format(e))
log.info("Finished wdw training model")
Code example #38
labels = pd.unique(y)

# split dataset into training/test portions
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0, stratify=y)

# PCA part
pca = PCA(n_components=3).fit(X)
X_pca = pca.transform(X)

pca = PCA(n_components=3).fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Over-sampling techniques
sm = SMOTE(random_state=1)
X_sm, y_sm = sm.fit_sample(X,y)
X_train_sm, y_train_sm = sm.fit_sample(X_train,y_train)

pca = PCA(n_components=3).fit(X_sm)
X_sm_pca = pca.transform(X_sm)

pca = PCA(n_components=3).fit(X_train_sm)
X_train_sm_pca = pca.transform(X_train_sm)
X_test_sm_pca = pca.transform(X_test)

def draw_learning_curve(X, y, X_pca, filename):
    clf = GaussianNB()
    train_sizes,train_scores, test_scores = learning_curve(
        clf, X, y, cv=10, n_jobs=8)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
Code example #39
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)

#Drop strain category column in x_test and x_all datasets.
x_test = x_test.drop(['Strain Category'], axis=1)
x_all = x_all.drop(['Strain Category'], axis=1)

#Resampling training dataset
train_dataset = pd.concat([x_train, y_train], axis=1, sort=False)
train_strain_cat = train_dataset.loc[:, ['Strain Category']]
train_dataset = train_dataset.drop(['Strain Category'], axis=1)

#Resampling
sampling_method = SMOTE()
train_dataset, train_strain_cat = sampling_method.fit_sample(
    train_dataset, train_strain_cat)
train_dataset = pd.DataFrame(train_dataset)

train_dataset.columns = column_names

#Drop Strain Category column in dataset variable.
#The column is only useful for resampling
dataset = dataset.drop(['Strain Category'], axis=1)

if resample_on == True:
    #Not resampled train dataset
    x_train_no_resample = []
    y_train_no_resample = []
    x_train_no_resample = x_train
    x_train_no_resample = x_train_no_resample.drop(['Strain Category'], axis=1)
    y_train_no_resample = y_train
Code example #40
File: devel.py  Project: tttor/csipb-jamu-prj
def ensembleSmote(xydev):
    xdevf,ydev = xydev
    sm = SMOTE(kind='svm',random_state=sh.getConst('smoteSeed'))
    xdevfr,ydevr = sm.fit_sample(xdevf,ydev)
    return (xdevfr,ydevr)
Code example #41
#PLotting the imbalanced dataset
sn.FacetGrid(data=df_blob2,hue='labels',size=3).map(plt.scatter,'feature_1','feature_2')
plt.legend()
plt.xlabel('feature_1')
plt.ylabel('feature_2')
plt.show()

'''
Technique 3: Creating synthetic samples (SMOTE)
Creates new synthetic samples rather than just repeating existing ones, as
traditional random oversampling does
'''
from imblearn.over_sampling import SMOTE
rus3 = SMOTE(ratio='minority',k_neighbors=3,random_state=42)
X3_res, y3_res = rus3.fit_sample(X_res, y_res)
y3_res=y3_res.reshape(20000,1)
dataset_blob3= np.concatenate((X3_res,y3_res),axis=1)
df_blob3=pd.DataFrame(dataset_blob3,columns=('feature_1','feature_2','labels'))

print('The number of samples of class 0 and 1:',pd.value_counts(df_blob3['labels'].values, sort=False))

#PLotting the imbalanced dataset
sn.FacetGrid(data=df_blob3,hue='labels',size=3).map(plt.scatter,'feature_1','feature_2')
plt.legend()
plt.xlabel('feature_1')
plt.ylabel('feature_2')
plt.show()


'''
Code example #42
from ay_hw_4._global import ROOT_PATH, APS_SHRINK, APS_FULL_COLUMNS
from ay_hw_4.util_data import load_data, to_binary_numeric

GENERATED_SMOTE_TRAIN_DATA_FILE_PATH = './gen_smote_train_shrink_data_set.csv'

if __name__ == "__main__":
    X_train, y_train = load_data(ROOT_PATH + APS_SHRINK,
                                 skip_first_row=21,
                                 y_column_index=0,
                                 assignedColumnNames=APS_FULL_COLUMNS,
                                 missingSymbol='na',
                                 needImpute=True,
                                 dropOrNot=True)

    smote = SMOTE(random_state=2333)
    smote_train_fit = smote.fit_sample(X_train, y_train)
    X_train_smote = pd.DataFrame(smote_train_fit[0])
    y_train_smote = pd.DataFrame(smote_train_fit[1], columns=['class'])
    export_smote_train_data = pd.concat([y_train_smote, X_train_smote], axis=1)

    # export data to csv
    export_smote_train_data.to_csv(GENERATED_SMOTE_TRAIN_DATA_FILE_PATH,
                                   sep=',',
                                   index=False)
    smote_train_data = convert.load_any_file(
        filename=GENERATED_SMOTE_TRAIN_DATA_FILE_PATH)
    smote_train_data.class_is_first()

    # load logistic model tree algorithm
    log_tree = Classifier(classname="weka.classifiers.trees.LMT")
    eval_smote_train_obj = Evaluation(smote_train_data)
Code example #43
"""COUNT OF ORIGINAL DATA"""

class_zero = 0
class_one = 0
for i in range(0,len(y_train)):
  if y_train[i]==0:
    class_zero = class_zero+1
  else:
    class_one = class_one + 1
print(class_zero, class_one)

"""OVERSAMPLING"""

smt = SMOTE()
over_x_train, over_y_train = smt.fit_sample(x_train_mod, y_train)
np.bincount(over_y_train)

"""UNDERSAMPLING"""

nr = NearMiss()
under_x_train, under_y_train = nr.fit_sample(x_train_mod, y_train)
np.bincount(under_y_train)

"""CLASSIFIER"""

def acc(pred, actual):
  tp = fp = tn = fn = 0
  for i in range(0, len(pred)):
    if np.round(pred[i]) == actual[i]:
      if np.round(pred[i])==0:
Code example #44
labelencoder_y = LabelEncoder()
df_product['gender'] = labelencoder_y.fit_transform(df_product['gender'])

# Target variable - class proportion
df_product.groupby(['gender'])['day_of_week_1'].count()

#Creating X - feature and y - target. Train test split - 80:20
X = df_product.iloc[:, :-1].values
y = df_product.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)
#Oversampling using SMOTE - Synthetic minority Oversampling technique
smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)
print(X_train.shape)
print(y_train.shape)

#logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
score = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("classification report:\n", classification_report(y_test, y_pred))

#Neural network
Code example #45
# %% Feature selection
dataProsSampleScaledSelected, classes = function.featureSelection(
    dataProsSampleScaled, y)
dataProsSampleScaledSelected_n, classes_n = function.featureSelection(
    dataProsSample_n, y_n)
# %% Transform data
pca = PCA(n_components=2)

X = pca.fit_transform(dataProsSampleScaledSelected)
x_n = pca.fit_transform(dataProsSampleScaledSelected_n)
plotter.plot_2d_space(X, y, 'Imbalanced dataset (2 PCA components)')

# %%
smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)

plotter.plot_2d_space(X_sm, y_sm, 'SMOTE over-sampling')

# %%

matrixes = function.run(dataProsSampleScaledSelected_n, y_n, dataLabel.columns)

# %%
dataTail = data.tail(1000)  # .where(data["poistunut"] == 1)
dataTail = dataTail.dropna(subset=["poistunut"])
dataTailPros = pre.prepare(dataTail.copy())

y_tail = dataTailPros.loc[:, "poistunut"]
drop = ["poistunut", "kasko_poistunut", "fetu_poistunut", "liikenne_poistunut"]
dataLabelTail, labelsTail = pre.labeling(dataTailPros.drop(columns=drop))
Code example #46
        cbar=False)
    plt.xlabel("true label")
    plt.ylabel("predicted label")
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)


plot_conf_mat(y_test, y_preds)

#Plot ROC curve and calculate Auc metric
plot_roc_curve(svm_model, test_matrix, y_test)

# applying SMOTE to balance the data
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train1, y_train1 = sm.fit_sample(X_train_transformed, y_train)
y_train1.value_counts()
y_train.value_counts()

svm_model = LinearSVC()
svm_model.fit(X_train1, y_train1)
train_pred_svc = svm_model.predict(X_train1)
accuracy_train_svc = np.mean(train_pred_svc == y_train1)
accuracy_train_svc
y_preds = svm_model.predict(X_test_transformed)
accuracy_test_svc = np.mean(y_preds == y_test)

print(classification_report(y_test, y_preds))
pd.crosstab(y_test, y_preds)

# Applying k-Fold Cross Validation
Code example #47
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#print("Number transactions training datasets: ", X_train.shape)
#print("Number transactions testing datasets: ", X_test.shape)

# Handling class imbalance

#one_hot["Loan_Status"].value_counts()   # Returns the number of elements in each category

from imblearn.over_sampling import SMOTE

#print("Before OverSampling - # of label 1: {}".format(sum(y_train==1)))
#print("Before OverSampling - # of label 0: {} \n".format(sum(y_train==0)))

sm = SMOTE(sampling_strategy=1.0, random_state=25)
X_train_new, y_train_new = sm.fit_sample(X_train, y_train)

#print("==============================================")

#print('After OverSampling - X_train shape: {}'.format(X_train_new.shape))
#print('After OverSampling - t_train shape: {} \n'.format(y_train_new.shape))

#print("After OverSampling - # of label 1: {}".format(sum(y_train_new==1)))
#print("After OverSampling - # of label 0: {}".format(sum(y_train_new==0)))

# Building the model

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score

# Logistic Regression
Code example #48

X = data_final.loc[:, data_final.columns != 'y']
y = data_final.loc[:, data_final.columns == 'y']


# In[101]:


from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of subscription",len(os_data_y[os_data_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))


# ### Recursive feature elimination

# In[102]:

Code example #49
def classifyUsers(profile,class_names = ["Negative","Positive"],stand_col_names=None,im_balance=False):
    """
    Classify users from the given user-feature DataFrame. The last column of the
    dataset is the feature to predict; the data is standardized automatically and
    rows with missing values are dropped. Imbalanced datasets can also be resampled.
    The input DataFrame must contain no nominal or string variables; encode them
    beforehand.

    By default three classifiers are used: decision tree, random forest and
    logistic regression. Results are generated automatically, including the
    confusion-matrix metrics and the important model parameters.
    :param profile: DataFrame of user features; the last column is the prediction target
    :param class_names: class names
    :param stand_col_names: names of the features to standardize; if None, all numeric features are standardized
    :param im_balance: whether to apply Borderline-SMOTE to balance the two classes
    :return:
    """

    # profile = pd.read_csv("/home/maoan/maidianAnalysis/level3-growth/userProfile.csv")
    # profile = pd.read_csv("/home/maoan/maidianAnalysis/level3-growth/user_actions.csv")
    # profile = pd.read_csv(profile_path)

    ## Construct the feature names.
    features = profile.columns.tolist()[:-1]
    pred_feature= profile.columns.tolist()[-1]


    ## basic data preprocessing
    profile.dropna(axis=0, how='any', inplace=True)
    # col_names = ['Freq','BattleRatio']
    # profile = transfromFeatures(profile)
    # standarlization
    norm_profile, _ = standarization(profile,stand_col_names)

    print("Features are: {0}".format(features))

    X = profile[features].as_matrix()
    y = profile[pred_feature].as_matrix()

    norm_x = norm_profile[features].as_matrix()
    norm_y = norm_profile[pred_feature].as_matrix()


    # dealing with the imbalanced data problem
    if im_balance:
        print('Original dataset shape {}'.format(Counter(norm_y)))
        print(np.median(norm_x, axis=0))
        sm = SMOTE(random_state=42,kind="borderline2")
        X_res, y_res = sm.fit_sample(norm_x, norm_y)
        X,y = X_res, y_res
        norm_x,norm_y = X_res, y_res

    # class_names = ['Non-VIP', 'VIP']

    ## choose the classifier and set the parameters
    min_split = 20
    max_dep = 3

    decisionTreeClassify(X,y,features,class_names,min_samples_split=min_split, max_depth=max_dep)

    # use normalized data
    logiRegressionClassify(norm_x,norm_y,features,class_names,penalty="l1")

    randomForestClassify(X,y,features,class_names,n_estimators=250)
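A hypothetical usage sketch (not in the original module), assuming profile is a fully numeric, pre-encoded DataFrame whose last column is the label to predict:

import pandas as pd

profile = pd.read_csv("userProfile.csv")   # placeholder path to the encoded user-feature table
classifyUsers(profile, class_names=["Non-VIP", "VIP"], im_balance=True)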
Code example #50
X_ts3, y_ts3, _ = load_data(testing_data_path3)
y_tr3 = np.array(map(int, y_tr3))
y_ts3 = np.array(map(int, y_ts3))
# generating a big dataset with training and testing samples
X3 = np.concatenate((X_tr3, X_ts3))
Y3 = np.concatenate((y_tr3, y_ts3))

sm = SMOTE(random_state=42)
'''
# Resampling only training data
X1,Y1 = sm.fit_sample(X_tr1, y_tr1)
X2,Y2 = sm.fit_sample(X_tr2, y_tr2)
X3,Y3 = sm.fit_sample(X_tr3, y_tr3)
'''
# Resampling the entire datasets
X1, Y1 = sm.fit_sample(X1, Y1)
X2, Y2 = sm.fit_sample(X2, Y2)
X3, Y3 = sm.fit_sample(X3, Y3)

pos_names = dict(
    zip(['A', 'C', 'D', 'F', 'I', 'N', 'P', 'R', 'S', 'V', 'W', 'Z'], [
        'adjective', 'conjunction', 'determiner', 'punctuation',
        'interjection', 'noun', 'pronoun', 'adverb', 'adposition', 'verb',
        'date', 'number'
    ]))

clf1 = tree.DecisionTreeClassifier(class_weight=None,
                                   criterion='entropy',
                                   max_depth=4,
                                   max_features=None,
                                   max_leaf_nodes=None,
Code example #51
def KFoldCrossValidation(train_and_test_indexes,
                         X_data_frame,
                         y_data_frame,
                         k_value=3,
                         kcv_value=9,
                         smote=True,
                         debug=False):
    train_indexes = train_and_test_indexes[0]
    #print('Train Indexes:',train_indexes)
    test_indexes = train_and_test_indexes[1]
    #print('Test Indexes:',test_indexes)

    knn = KNeighborsClassifier(n_neighbors=k_value)

    #if debug:
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index, "\n")

    # STEP 1: split data between test and train sets
    if debug:
        print('* Starting train and test sets splitting... ', end='')

    y_data = np.ravel(y_data_frame)  # Added to solve column-vector issue

    X_train, X_test, y_train, y_test = X_data_frame[
        train_indexes], X_data_frame[test_indexes], y_data[
            train_indexes], y_data[test_indexes]
    #print('y_data[test_indexes]:',y_data[test_indexes])
    if debug:
        print('Done!')

    # print the shapes of the new X objects
    if debug:
        print('* Display X and y objects\'s shape:')
        print('\t X_train.shape: ', X_train.shape)
        print('\t X_test.shape: ', X_test.shape)
        print('\t y_train.shape: ', y_train.shape)
        print('\t y_test.shape: ', y_test.shape)

    # SMOTE HERE

    if smote:
        # Oversampling training data using SMOTE
        if debug:
            print('* Starting to oversample training data using SMOTE...')
            print(
                '\t -Number of instances inside TRAIN set from each class BEFORE to apply SMOTE=',
                (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))
            print(
                '\t -Number of instances inside TEST set from each class BEFORE to apply SMOTE=',
                (sum(y_test == 0), sum(y_test == 1), sum(y_test == 2)))
            print(
                '\t -Number of instances inside TRAIN set from each class BEFORE to apply SMOTE=',
                (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))
            print(
                '\t -Number of instances inside TEST set  from each class BEFORE to apply SMOTE=',
                (sum(y_test == 0), sum(y_test == 1), sum(y_test == 2)))

        from imblearn.over_sampling import SMOTE
        smt = SMOTE()
        X_train, y_train = smt.fit_sample(X_train, y_train)

        if debug:
            print('\t -Instances amount from each class AFTER to apply SMOTE=',
                  (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))

    #print('y_train:',y_train)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    #print('y_test:',y_data[test_indexes])
    #print('y_pred=',y_pred)

    # comparing actual response values (y_test) with predicted response values (y_pred)
    this_accuracy = metrics.accuracy_score(y_test, y_pred)
    this_confusion_matrix = metrics.confusion_matrix(y_test,
                                                     y_pred,
                                                     labels=None,
                                                     sample_weight=None)

    return this_accuracy, this_confusion_matrix
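For reference, an equivalent formulation (my sketch, not from this module) uses an imbalanced-learn Pipeline; the sampler is applied only during fit, so cross-validation resamples just the training folds, exactly as KFoldCrossValidation does by hand above:

import numpy as np
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# X_data_frame and y_data_frame mirror the arguments of KFoldCrossValidation above
pipe = make_pipeline(SMOTE(), KNeighborsClassifier(n_neighbors=3))
scores = cross_val_score(pipe, X_data_frame, np.ravel(y_data_frame), cv=9)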
Code example #52
def runMODEL(X_data,
             y_data,
             k_value=3,
             knn_debug=False,
             use_smote=True,
             use_rescaling=True,
             cv_type='kcv',
             kcv_value=9,
             use_Pool=False,
             model=__MODEL):

    # Main variables
    #    global __MODEL
    #    model = __MODEL
    #knn = KNeighborsClassifier(n_neighbors=k_value)
    accuracy = 0
    confusion_matrix = []

    if knn_debug:
        print('* Checking arguments formating...')
        print('\t-X_data.shape=', X_data.shape)
        print('\t-y_data.shape=', y_data.shape)

    # Data preparation
    try:
        new_dimensions = (X_data.shape[0], X_data.shape[1] * X_data.shape[2])
    except IndexError:
        print('** IndexError exception')
        print('\tX_data.shape=', X_data.shape)
        print('\ty_data.shape=', y_data.shape)
        print('\t')
        sys.exit(-1)

    if knn_debug:
        print('* Reshaping for this data partition with these dimensions:',
              new_dimensions)

    new_partition = np.reshape(X_data, new_dimensions)

    if knn_debug:
        print('...done')

    if knn_debug:
        print('* The shape of a data row retrieved from the new partition=',
              new_partition[0].shape)

    ## KNN preparation

    # Preparing data to use with PANDAS
    X_pandas = pd.DataFrame(data=new_partition)
    y_pandas = pd.DataFrame(data=y_data)

    if knn_debug:
        print('* Preparing data to use with Pandas...')
        print('X_pandas=\n', X_pandas)
        print('y_pandas=\n', y_pandas)

    # Rescaling data
    if use_rescaling:
        if knn_debug:
            print('* Rescaling data... ', end='')
        from sklearn import preprocessing
        scaler = preprocessing.StandardScaler()
        X_pandas = scaler.fit_transform(
            X_pandas)  # Fit your data on the scaler object
        if knn_debug:
            print('done')
            print('Rescaled X_pandas=\n', X_pandas)

    #######################################################
    if cv_type == 'kcv':  # K-FOLD CROSS VALIDATION
        scores = []
        matrices = []
        all_results = []

        cv = KFold(n_splits=kcv_value, random_state=42, shuffle=True)
        both_indexes = cv.split(X_pandas)

        #-----------------------------------
        if not use_Pool:  # Single Thread KCrossValidation

            # Single Thread KCrossValidation
            for indexes in both_indexes:

                result = KFoldCrossValidation(indexes, X_pandas, y_pandas,
                                              k_value, kcv_value, use_smote,
                                              knn_debug)
                all_results.append(result)

            for acc_with_cmat in all_results:
                acc = acc_with_cmat[0]
                cmat = acc_with_cmat[1]
                scores.append(acc)
                matrices.append(cmat)

            np_scores = np.array(scores)
            best_pos = np_scores.argmax()

            accuracy = np.mean(np_scores)
            confusion_matrix = matrices[best_pos]

        #-----------------------------------
        else:
            # Multi Thread KCrossValidation
            # NOT WORKING YET!!
            cores_num = multiprocessing.cpu_count()
            with Pool(processes=cores_num) as p:
                from functools import partial

                all_results = p.map(
                    partial(KFoldCrossValidation,
                            X_data_frame=X_pandas,
                            y_data_frame=y_pandas,
                            k_value=k_value,
                            kcv_value=kcv_value,
                            smote=use_smote,
                            debug=knn_debug), both_indexes)

            for acc_with_cmat in all_results:
                acc = acc_with_cmat[0]
                cmat = acc_with_cmat[1]
                scores.append(acc)
                matrices.append(cmat)

            np_scores = np.array(scores)
            best_pos = np_scores.argmax()

            accuracy = np.mean(np_scores)
            confusion_matrix = matrices[best_pos]


#########################
    else:

        # validation with simple split data between test and train sets
        if knn_debug:
            print('* Starting train and test sets splitting... ', end='')
        X_train, X_test, y_train, y_test = train_test_split(X_pandas,
                                                            np.ravel(y_pandas),
                                                            test_size=0.3,
                                                            random_state=12)
        if knn_debug:
            print('done')

        # print the shapes of the new X objects
        if knn_debug:
            print('* X and y objects\' shapes:')
            print('\t X_train.shape: ', X_train.shape)
            print('\t X_test.shape: ', X_test.shape)
            print('\t y_train.shape: ', y_train.shape)
            print('\t y_test.shape: ', y_test.shape)

        if use_smote:
            # Oversampling training data using SMOTE
            if knn_debug:
                print('* Oversampling the training data with SMOTE...')
                print(
                    '\t - Instances per class in the TRAIN set BEFORE SMOTE =',
                    (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))
                print(
                    '\t - Instances per class in the TEST set BEFORE SMOTE =',
                    (sum(y_test == 0), sum(y_test == 1), sum(y_test == 2)))

            from imblearn.over_sampling import SMOTE
            smt = SMOTE()
            X_train, y_train = smt.fit_sample(X_train, y_train)

            if knn_debug:
                print(
                    '\t - Instances per class in the TRAIN set AFTER SMOTE =',
                    (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))

        # print the shapes of the new X and y objects
        if knn_debug:
            print(
                '* X and y objects\' shapes after the SMOTE step:'
            )
            print('\t X_train.shape: ', X_train.shape)
            print('\t X_test.shape (should be the same): ', X_test.shape)
            print('\t y_train.shape: ', y_train.shape)
            print('\t y_test.shape (should be the same): ', y_test.shape)

        # STEP 2: train the model on the training set
        #knn = KNeighborsClassifier(n_neighbors=k_value)
        model.fit(X_train, y_train)

        # STEP 3: make predictions on the testing set
        y_pred = model.predict(X_test)
        #if knn_debug:
        #    print('y_pred=\n',y_pred)
        #    print('y_pred.shape:',y_pred.shape)

        # compare actual response values (y_test) with predicted response values (y_pred)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        confusion_matrix = metrics.confusion_matrix(y_test,
                                                    y_pred,
                                                    labels=None,
                                                    sample_weight=None)

    return accuracy, confusion_matrix
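
Note that the k-fold branches above average the fold accuracies but keep only the confusion matrix of the single best fold. If an overall matrix is preferred, one option (a minimal sketch, assuming every entry in `matrices` is a NumPy confusion matrix over the same label set) is to sum the per-fold matrices instead:

# Aggregate the per-fold confusion matrices into a single overall matrix
overall_cmat = np.sum(np.array(matrices), axis=0)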
Code example #53
0
def test_smote_wrong_kind():
    kind = 'rnd'
    smote = SMOTE(kind=kind, random_state=RND_SEED)
    with raises(ValueError, match="Unknown kind for SMOTE"):
        smote.fit_sample(X, Y)
Code example #54
0
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# code starts here
model = LogisticRegression(random_state=6)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
# Code ends here

# --------------
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# code starts here
smote = SMOTE(random_state=9)
X_train, y_train = smote.fit_sample(X_train, y_train)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Code ends here

# --------------
# Code Starts here
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
# Code ends here
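
The three cells above resample, scale, and refit as separate steps. A hedged alternative sketch (not part of the original exercise, and assuming the raw, un-resampled X_train/X_test splits) is imbalanced-learn's Pipeline, which runs the sampler only while fitting, so the test set is never resampled and the whole chain can be reused inside cross-validation:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([('smote', SMOTE(random_state=9)),
                 ('scale', StandardScaler()),
                 ('clf', LogisticRegression())])
pipe.fit(X_train, y_train)          # SMOTE and the scaler see the training data only
score = pipe.score(X_test, y_test)  # the sampler step is skipped at predict time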
Code example #55
0
            del a[3]
        if (radon):
            del a[4]
        if (zipc):
            del a[5]
        a = np.array(a)
        b.append(a)
    b = np.array(b)
    return b


X_train = deleteFeatures(False, False, False, False, False, False, X_train)
X_test = deleteFeatures(False, False, False, False, False, False, X_test)

smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)
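# NOTE: the next line also oversamples the test set, which changes the evaluation
# distribution; SMOTE is normally applied to the training split only.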
X_test, y_test = smt.fit_sample(X_test, y_test)


# Function to get metrics
def evaluate_model(metrics, model, y_test, X_test):
    y_pred = model.predict(X_test, verbose=1)
    y_pred_coded = np.where(y_pred > 0.5, 1, 0)
    y_pred_coded = y_pred_coded.flatten()
    metric = []
    metric.append(['f1score', f1_score(y_test, y_pred_coded)])
    metric.append(['precision', precision_score(y_test, y_pred_coded)])
    metric.append(['recall', recall_score(y_test, y_pred_coded)])
    metric.append(['accuracy', accuracy_score(y_test, y_pred_coded)])
    metrics.append(metric)
    return metrics, y_pred
Code example #56
0
    logger.info('ROC AUC score: ' + str(roc_auc_score(y_val, prob1)))
    logger.info('Precision Recall AUC score: ' +
                str(funciones.precision_recall_auc_score(y_val, prob1)))
    logger.info('F1 score: ' + str(f1_score(y_val, pred1)))
    logger.info('Balanced accuracy score: ' +
                str(balanced_accuracy_score(y_val, pred1)))
    logger.info('Precision score: ' + str(precision_score(y_val, pred1)))
    logger.info('Recall score: ' + str(recall_score(y_val, pred1)))

    logger.info('***' * 20)

    ############ Method 8: SMOTE oversampling
    from imblearn.over_sampling import SMOTE
    logger.info('SMOTE: ')
    smote = SMOTE(sampling_strategy=samp, random_state=42, n_jobs=n_proc)
    X_rus, y_rus = smote.fit_sample(x_tra, y_tra)
    classifier8 = clone(classifier)
    classifier8.fit(X_rus, y_rus)
    pred1 = classifier8.predict(x_val)
    prob1 = classifier8.predict_proba(x_val)[:, 1]
    print('\n\nSMOTE: ')
    print('ROC AUC score: ' + str(roc_auc_score(y_val, prob1)))
    print('Precision Recall AUC score: ' +
          str(funciones.precision_recall_auc_score(y_val, prob1)))
    print('F1 score: ' + str(f1_score(y_val, pred1)))
    print('Balanced accuracy score: ' +
          str(balanced_accuracy_score(y_val, pred1)))
    print('Precision score: ' + str(precision_score(y_val, pred1)))
    print('Recall score: ' + str(recall_score(y_val, pred1)))

    logger.info('ROC AUC score: ' + str(roc_auc_score(y_val, prob1)))
Code example #57
0
from imblearn.over_sampling import SMOTE

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Borderline SMOTE 1
sm = SMOTE(kind='borderline1')
X_resampled, y_resampled = sm.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
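
In more recent imbalanced-learn releases the `kind` argument was removed from `SMOTE` and the borderline variants moved to a dedicated class. A minimal sketch of the resampling step above under that newer API (assuming imbalanced-learn 0.4 or later):

from imblearn.over_sampling import BorderlineSMOTE

# 'borderline-1' corresponds to kind='borderline1' in the old SMOTE API
sm = BorderlineSMOTE(kind='borderline-1')
X_resampled, y_resampled = sm.fit_resample(X, y)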
Code example #58
0
File: train.py    Project: JohnnyChangCheng/DLAPP_LAB
def run_program(hparams, FLAGS):
    # load dataset
    num_folds = FLAGS.num_folds
    data_dir = FLAGS.data_dir
    data_version = 2013

    output_dir = FLAGS.output_dir
    classes = ['W', 'N1', 'N2', 'N3', 'REM']
    n_classes = len(classes)

    path, channel_ename = os.path.split(data_dir)
    traindata_dir = os.path.join(
        os.path.abspath(os.path.join(data_dir, os.pardir)), 'traindata/')
    print(str(datetime.now()))

    def evaluate_model(hparams, X_test, y_test, classes):
        # acc_track = []
        n_classes = len(classes)
        y_true = []
        y_pred = []
        alignments_alphas_all = []  # (batch_num,B,max_time_step,max_time_step)
        for batch_i, (source_batch, target_batch) in enumerate(
                batch_data(X_test, y_test, hparams.batch_size)):
            pred_outputs_ = sess.run(pred_outputs,
                                     feed_dict={
                                         inputs: source_batch,
                                         keep_prob_: 1.0
                                     })
            alignments_alphas = sess.run(dec_states.alignment_history.stack(),
                                         feed_dict={
                                             inputs: source_batch,
                                             dec_inputs: target_batch[:, :-1],
                                             keep_prob_: 1.0
                                         })

            # acc_track.append(np.mean(dec_input == target_batch))
            # remove the last prediction <EOD>
            pred_outputs_ = pred_outputs_[:, :hparams.max_time_step]
            # remove the last <EOD> and the first <SOD>
            target_batch_ = target_batch[:, 1:-1]
            # acc_track.append(pred_outputs_ == target_batch_)

            alignments_alphas = alignments_alphas.transpose((1, 0, 2))
            alignments_alphas = alignments_alphas[:, :hparams.max_time_step]
            alignments_alphas_all.append(alignments_alphas)

            _y_true = target_batch_.flatten()
            _y_pred = pred_outputs_.flatten()

            y_true.extend(_y_true)
            y_pred.extend(_y_pred)

        cm = confusion_matrix(y_true, y_pred, labels=range(n_classes))
        ck_score = cohen_kappa_score(y_true, y_pred)
        acc_avg, acc, f1_macro, f1, sensitivity, specificity, PPV = evaluate_metrics(
            cm, classes)
        # print ("batch_i: {}").format(batch_i)

        print(
            'Average Accuracy -> {:>6.4f}, Macro F1 -> {:>6.4f} and Cohen\'s Kappa -> {:>6.4f} on test set'
            .format(acc_avg, f1_macro, ck_score))

        for index_ in range(n_classes):
            print(
                "\t{} rhythm -> Sensitivity: {:1.4f}, Specificity: {:1.4f}, Precision (PPV): {:1.4f}, F1 : {:1.4f} Accuracy: {:1.4f}"
                .format(classes[index_], sensitivity[index_],
                        specificity[index_], PPV[index_], f1[index_],
                        acc[index_]))

        print(
            "\tAverage -> Sensitivity: {:1.4f}, Specificity: {:1.4f}, Precision (PPV): {:1.4f}, F1-score: {:1.4f}, Accuracy: {:1.4f}"
            .format(np.mean(sensitivity), np.mean(specificity), np.mean(PPV),
                    np.mean(f1), np.mean(acc)))

        return acc_avg, f1_macro, ck_score, y_true, y_pred, alignments_alphas_all

    def count_prameters():
        print(
            '# of Params: ',
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.trainable_variables()
            ]))

    for fold_idx in range(num_folds):
        start_time_fold_i = time.time()
        data_loader = SeqDataLoader(data_dir,
                                    num_folds,
                                    fold_idx,
                                    classes=classes)
        X_train, y_train, X_test, y_test = data_loader.load_data(
            seq_len=hparams.max_time_step)

        # preprocessing
        char2numY = dict(zip(classes, range(len(classes))))
        pre_f1_macro = 0

        # <SOD> marks the start of decoding and <EOD> marks the end of decoding
        char2numY['<SOD>'] = len(char2numY)
        char2numY['<EOD>'] = len(char2numY)
        num2charY = dict(zip(char2numY.values(), char2numY.keys()))

        # over-sampling: SMOTE:
        X_train = np.reshape(X_train,
                             [X_train.shape[0] * X_train.shape[1], -1])
        y_train = y_train.flatten()

        nums = []
        for cl in classes:
            nums.append(len(np.where(y_train == char2numY[cl])[0]))

        if not os.path.exists(traindata_dir):
            os.mkdir(traindata_dir)
        fname = os.path.join(
            traindata_dir, 'trainData_' + channel_ename + '_SMOTE_all_10s_f' +
            str(fold_idx) + '.npz')

        if (os.path.isfile(fname)):
            X_train, y_train, _ = data_loader.load_npz_file(fname)

        else:
            n_osamples = nums[2] - 7000
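            # Target per-class counts: classes with fewer than n_osamples samples are
            # oversampled up to n_osamples; the majority class N2 (index 2) is unchanged.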
            ratio = {
                0: n_osamples if nums[0] < n_osamples else nums[0],
                1: n_osamples if nums[1] < n_osamples else nums[1],
                2: nums[2],
                3: n_osamples if nums[3] < n_osamples else nums[3],
                4: n_osamples if nums[4] < n_osamples else nums[4]
            }

            sm = SMOTE(random_state=12, ratio=ratio)
            X_train, y_train = sm.fit_sample(X_train, y_train)
            data_loader.save_to_npz_file(X_train, y_train,
                                         data_loader.sampling_rate, fname)

        X_train = X_train[:(X_train.shape[0] // hparams.max_time_step) *
                          hparams.max_time_step, :]
        y_train = y_train[:(X_train.shape[0] // hparams.max_time_step) *
                          hparams.max_time_step]

        X_train = np.reshape(X_train, [-1, X_test.shape[1], X_test.shape[2]])
        y_train = np.reshape(y_train, [
            -1,
            y_test.shape[1],
        ])

        # shuffle the training data
        permute = np.random.permutation(len(y_train))
        X_train = np.asarray(X_train)
        X_train = X_train[permute]
        y_train = y_train[permute]

        # add '<SOD>' to the beginning of each label sequence, and '<EOD>' to the end of each label sequence (both for training and test sets)
        y_train = [[char2numY['<SOD>']] + [y_ for y_ in date] +
                   [char2numY['<EOD>']] for date in y_train]
        y_train = np.array(y_train)

        y_test = [[char2numY['<SOD>']] + [y_ for y_ in date] +
                  [char2numY['<EOD>']] for date in y_test]
        y_test = np.array(y_test)

        print('The training set after oversampling: ', classes)
        for cl in classes:
            print(cl, len(np.where(y_train == char2numY[cl])[0]))

        # training and testing the model
        if not os.path.exists(FLAGS.checkpoint_dir):
            os.mkdir(FLAGS.checkpoint_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        loss_track = []
        with tf.Graph().as_default(), tf.Session() as sess:

            # Placeholders
            inputs = tf.placeholder(
                tf.float32, [None, hparams.max_time_step, hparams.input_depth],
                name='inputs')
            targets = tf.placeholder(tf.int32, (None, None), 'targets')
            dec_inputs = tf.placeholder(tf.int32, (None, None),
                                        'decoder_inputs')
            keep_prob_ = tf.placeholder(tf.float32, name='keep')

            # model
            logits, pred_outputs, loss, optimizer, dec_states = build_whole_model(
                hparams, char2numY, inputs, targets, dec_inputs, keep_prob_)
            count_prameters()
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            saver = tf.train.Saver()
            print(str(datetime.now()))

            ckpt_name = "model_fold{:02d}.ckpt".format(fold_idx)
            ckpt_exist = False
            for file in os.listdir(FLAGS.checkpoint_dir):
                if file.startswith(ckpt_name):
                    ckpt_exist = True
            ckpt_name = os.path.join(FLAGS.checkpoint_dir, ckpt_name)

            if ckpt_exist:
                saver.restore(sess, ckpt_name)
                evaluate_model(hparams, X_test, y_test, classes)
            else:
                for epoch_i in range(hparams.epochs):
                    start_time = time.time()
                    # train_acc = []
                    y_true = []
                    y_pred = []
                    for batch_i, (source_batch, target_batch) in enumerate(
                            batch_data(X_train, y_train, hparams.batch_size)):
                        _, batch_loss, batch_logits = sess.run(
                            [optimizer, loss, logits],
                            feed_dict={
                                inputs: source_batch,
                                dec_inputs: target_batch[:, :-1],
                                targets: target_batch[:, 1:],
                                keep_prob_: 0.5
                            }  #,
                        )
                        loss_track.append(batch_loss)
                        # train_acc.append(batch_logits.argmax(axis=-1) == target_batch[:,1:])
                        y_pred_ = batch_logits[:, :hparams.max_time_step].argmax(axis=-1)
                        y_true_ = target_batch[:, 1:-1]

                        y_true.extend(y_true_)
                        y_pred.extend(y_pred_)

                    # accuracy = np.mean(train_acc)
                    y_true = np.asarray(y_true)
                    y_pred = np.asarray(y_pred)
                    y_true = y_true.flatten()
                    y_pred = y_pred.flatten()
                    n_examples = len(y_true)
                    cm = confusion_matrix(y_true,
                                          y_pred,
                                          labels=range(len(char2numY) - 2))
                    accuracy = np.mean(y_true == y_pred)
                    mf1 = f1_score(y_true, y_pred, average="macro")
                    ck_score = cohen_kappa_score(y_true, y_pred)

                    print(
                        'Epoch {:3} Loss: {:>6.3f} Accuracy: {:>6.4f} F1-score: {:>6.4f} Cohen\'s Kappa: {:>6.4f} Epoch duration: {:>6.3f}s'
                        .format(epoch_i, np.mean(batch_loss), accuracy, mf1,
                                ck_score,
                                time.time() - start_time))

                    if (epoch_i + 1) % hparams.test_step == 0:
                        acc_avg, f1_macro, ck_score, y_true, y_pred, alignments_alphas_all = evaluate_model(
                            hparams, X_test, y_test, classes)

                        # save the better model based on the f1 score
                        if np.nan_to_num(f1_macro) > pre_f1_macro:
                            print(
                                'Loss {:.4f} after {} epochs (batch_size={})'.
                                format(loss_track[-1], epoch_i + 1,
                                       hparams.batch_size))
                            pre_f1_macro = f1_macro
                            ckpt_name = "model_fold{:02d}.ckpt".format(
                                fold_idx)
                            save_path = os.path.join(FLAGS.checkpoint_dir,
                                                     ckpt_name)
                            saver.save(sess, save_path)
                            print(
                                "The best model (till now) saved in path: %s" %
                                save_path)

                            # Save
                            save_dict = {
                                "y_true": y_true,
                                "y_pred": y_pred,
                                "ck_score": ck_score,
                                # keep only the first 200 batches; the full history is huge
                                "alignments_alphas_all": alignments_alphas_all[:200],
                            }

                            filename = "output_" + channel_ename + "_fold{:02d}.npz".format(
                                fold_idx)
                            save_path = os.path.join(output_dir, filename)
                            np.savez(save_path, **save_dict)
                            print(
                                "The best results (till now) saved in path: %s"
                                % save_path)

                # plt.plot(loss_track)
                # plt.show()

            print(str(datetime.now()))
            print('Fold{} took: {:>6.3f}s'.format(
                fold_idx,
                time.time() - start_time_fold_i))
Code example #59
0
    print(len(x_features_test))
    return (x_features_train, x_features_test, x_labels_train, x_labels_test)


data = pd.read_csv('creditcard.csv')
os = SMOTE(random_state=0)  # we are using SMOTE as the function for oversampling
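# (Naming this variable `os` shadows Python's standard `os` module if that module
#  is imported in this script.)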
print(os)

# Now we can divide our data into training and test sets
# Call our data_prepration method on the dataset
data_train_X, data_test_X, data_train_y, data_test_y = data_prepration(data)
print(type(data_test_X))
columns = data_train_X.columns

os_data_X, os_data_y = os.fit_sample(
    data_train_X, data_train_y)  # the resampled features and labels
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=["Class"])
# We can check the class counts of the resampled data

print("length of oversampled data is ", len(os_data_X))
print("Number of normal transcation in oversampled data",
      len(os_data_y[os_data_y["Class"] == 0]))
print("No.of fraud transcation", len(os_data_y[os_data_y["Class"] == 1]))
print("Proportion of Normal data in oversampled data is ",
      len(os_data_y[os_data_y["Class"] == 0]) / len(os_data_X))
print("Proportion of fraud data in oversampled data is ",
      len(os_data_y[os_data_y["Class"] == 1]) / len(os_data_X))

os_data_X["Normalized Amount"] = StandardScaler().fit_transform(
    os_data_X['Amount'].values.reshape(-1, 1))
Code example #60
0
def smote_tech(X, Y):

    smote = SMOTE(ratio='minority')
    X_sm, Y_sm = smote.fit_sample(X, Y)
    return X_sm, Y_sm
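
For reference, a hedged sketch of the same helper against the newer imbalanced-learn API (roughly 0.4 and later), where `ratio` and `fit_sample` were replaced by `sampling_strategy` and `fit_resample`:

from imblearn.over_sampling import SMOTE


def smote_tech_resampled(X, Y):
    # 'minority' resamples only the minority class, as ratio='minority' did
    smote = SMOTE(sampling_strategy='minority')
    X_sm, Y_sm = smote.fit_resample(X, Y)
    return X_sm, Y_sm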