Example No. 1
import pandas as pd
from sklearn.utils import shuffle   # assumed import for shuffle() below
import smote                        # local SMOTE implementation (not shown)

def resample_train_data(train_data, n, frac):
    numeric_attrs = ['age', 'duration', 'campaign', 'pdays', 'previous',
                     'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                     'euribor3m', 'nr.employed']
    #numeric_attrs = train_data.drop('y',axis=1).columns
    # keep the original positives and take a copy to oversample via SMOTE
    pos_train_data_original = train_data[train_data['y'] == 1]
    pos_train_data = train_data[train_data['y'] == 1]
    new_count = n * pos_train_data['y'].count()
    # subsample the negative class down to the given fraction
    neg_train_data = train_data[train_data['y'] == 0].sample(frac=frac)
    train_list = []
    if n != 0:
        pos_train_X = pos_train_data[numeric_attrs]
        # replicate the non-numeric columns n times to pair with the synthetic rows
        pos_train_X2 = pd.concat([pos_train_data.drop(numeric_attrs, axis=1)] * n)
        pos_train_X2.index = range(new_count)

        # generate n synthetic rows per positive sample from the numeric attributes
        s = smote.Smote(pos_train_X.values, N=n, k=3)
        pos_train_X = s.over_sampling()
        pos_train_X = pd.DataFrame(pos_train_X, columns=numeric_attrs,
                                   index=range(new_count))
        pos_train_data = pd.concat([pos_train_X, pos_train_X2], axis=1)
        # restore the original column order
        pos_train_data = pd.DataFrame(pos_train_data, columns=pos_train_data_original.columns)
        train_list = [pos_train_data, neg_train_data, pos_train_data_original]
    else:
        train_list = [neg_train_data, pos_train_data_original]
    print("Size of positive train data: {} * {}".format(pos_train_data_original['y'].count(), n+1))
    print("Size of negative train data: {} * {}".format(neg_train_data['y'].count(), frac))
    train_data = pd.concat(train_list, axis=0)
    return shuffle(train_data)
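
The local smote module these snippets rely on is not shown. For reference, below is a minimal sketch of the Smote(samples, N, k) / over_sampling() interface that Examples 1 and 3 assume, built on scikit-learn's NearestNeighbors; the real module's details may differ (Example 2, for instance, uses a variant with a fit_transform method).

import numpy as np
from sklearn.neighbors import NearestNeighbors

class Smote:
    """Sketch of classic SMOTE: N synthetic rows per original sample."""
    def __init__(self, samples, N=1, k=3):
        self.samples = np.asarray(samples, dtype=float)
        self.N = int(N)  # synthetic samples generated per original sample
        self.k = k       # neighbours considered for interpolation

    def over_sampling(self):
        n = self.samples.shape[0]
        nn = NearestNeighbors(n_neighbors=self.k + 1).fit(self.samples)
        _, idx = nn.kneighbors(self.samples)  # idx[:, 0] is the point itself
        synthetic = []
        for i in range(n):
            for _ in range(self.N):
                j = np.random.choice(idx[i, 1:])  # pick a random neighbour
                gap = np.random.rand()            # interpolation weight in [0, 1)
                synthetic.append(self.samples[i]
                                 + gap * (self.samples[j] - self.samples[i]))
        return np.array(synthetic)  # shape: (N * n, n_features)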
Example No. 2
    # Method of a dataset-loading class (the class definition is not shown).
    # Assumes module-level imports cx_Oracle as cx, pandas as pd, numpy as np,
    # math and smote, plus module-level x_columns / target definitions.
    def __init__(self,
                 sql_code='select * from wepon_d1',
                 smote_pro=True,
                 smote_k=5):
        # connect to Oracle and pull the training table
        conn = cx.connect('coupon/coupon@pai_db')
        self.sql_code = sql_code
        df = pd.read_sql(sql_code, conn)
        X1 = df[x_columns]
        y1 = df[target]
        s_label_count = [(i, y1[y1 == i].count()) for i in y1.unique()]
        y1 = np.asarray(y1)

        # clean the feature matrix: map None/NaN to 0 and cast everything to float
        X1 = X1.applymap(lambda x: 0 if x is None else x)
        X1 = X1.applymap(lambda x: 0 if np.isnan(x) else x)
        X1 = X1.applymap(lambda x: x * 1.0)
        datas = np.asanyarray(X1, np.float32)

        if smote_pro:
            # oversample every minority label up to the majority-class count
            s_label_count.sort(key=lambda x: x[1], reverse=True)
            _, max_label_count = s_label_count[0]
            for s_label, s_count in s_label_count[1:]:
                s = smote.Smote(N=math.ceil(max_label_count / s_count),
                                k=smote_k)
                smotedata = s.fit_transform(np.asarray(datas[y1 == s_label]))
                # keep just enough synthetic rows to reach the majority count
                datas = np.vstack(
                    [datas, smotedata[:max_label_count - s_count]])
                add_y1 = np.zeros(max_label_count - s_count, dtype='int32')
                add_y1[:] = s_label
                y1 = np.hstack([y1, add_y1])

        self.datas = datas
        self.labels = np.zeros([len(y1), 2])
        self.labels[:, 0] = np.where(y1 == 0, 1, 0)
        self.labels[:, 1] = np.where(y1 == 1, 1, 0)
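
The final three lines build a two-column one-hot label matrix from the binary target. On a toy label vector (illustration only):

import numpy as np

y1 = np.array([0, 1, 1, 0])
labels = np.zeros([len(y1), 2])
labels[:, 0] = np.where(y1 == 0, 1, 0)  # column 0 flags class 0
labels[:, 1] = np.where(y1 == 1, 1, 0)  # column 1 flags class 1
# labels -> [[1., 0.], [0., 1.], [0., 1.], [1., 0.]]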
Example No. 3
# Assumed imports for this script; the helpers loaddata, preprocess,
# load_stopwords, fenci, feature_extraction, train_model and predict are
# defined elsewhere in the same project.
import numpy as np
import pandas as pd
import smote
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    data = loaddata()
    data = preprocess(data)
    x_train, x_test, y_train, y_test = train_test_split(data['cus_comment'],
                                                        data['target'],
                                                        random_state=3,
                                                        test_size=0.25)

    stopwords = load_stopwords()
    x_train_fenci = fenci(x_train)
    tf = feature_extraction(x_train_fenci, stopwords)

    x_train_tf = tf.transform(x_train_fenci).toarray()
    # collect the class-0 samples (the class being oversampled)
    samples0 = []
    for i, label in enumerate(y_train):
        if label == 0:
            samples0.append(x_train_tf[i])
    s = smote.Smote(np.array(samples0), N=600)
    over_samplings_x = s.over_sampling()
    # stack the synthetic rows under the originals; their labels are all 0
    total_samplings_x = np.vstack((x_train_tf, over_samplings_x))
    total_samplings_y = np.concatenate(
        (y_train, np.zeros(len(over_samplings_x))), axis=0)
    #model=train_model(x_train_tf, y_train,tf.transform(fenci(x_test)),y_test)
    model = train_model(total_samplings_x, total_samplings_y,
                        tf.transform(fenci(x_test)), y_test)
    y_predict = model.predict(tf.transform(fenci(x_test)))
    # spot check on two raw reviews: comment1 is positive, comment2 is negative
    comment1 = "一如既往的好。已经快成了陆家嘴上班的我的食堂了。满减活动非常给力,上次叫了八样东西,折扣下来居然就六十左右,吃得好爽好爽。南瓜吃过几次,就一次不够酥烂,其他几次都很好。烤麸非常入味,适合上海人。鱼香肉丝有点辣,下饭刚好。那个蔬菜每次都点。总体很好吃。"
    comment2 = "糯米外皮不绵滑,豆沙馅粗躁,没有香甜味。12元一碗不值。"
    print(predict(model, pd.Series([comment1]), tf))
    print(predict(model, pd.Series([comment2]), tf))
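
The predict helper is not defined in this snippet. A plausible sketch that is consistent with how the model is used above (the body is an assumption; fenci and tf come from this script):

def predict(model, comments, tf):
    # segment the raw comments, vectorize with the fitted TF-IDF model, classify
    return model.predict(tf.transform(fenci(comments)))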
Example No. 4
# Tail of a scree-plot cell; assumes matplotlib.pyplot as plt and a fitted
# pc_analyser from earlier cells.
plt.title("Eigen Values of the Principal Components")
plt.xlim((0, 30))

# %%
PCA_data.columns = ["PC" + str(i) for i in range(1, 8)]  # name the 7 retained components
print("Shape of the Feature Matrix after PCA is:", PCA_data.shape)
print("PVE of the chosen PCs is:", pc_analyser.calc_PVE(m=7))
PCA_data = pd.concat([clean_Y, PCA_data], axis=1)

# %% [markdown]
# ### SMOTE

# %%
minority = PCA_data[PCA_data["Bankrupt?"] == 1]  # Extract minority samples from data
smt = smote.Smote(minority.to_numpy())           # Initialize the SMOTE class
oversamples = smt.oversample(N=2600)             # Employ SMOTE oversampling

# %%
# Cleared of outliers and dim-reduced by PCA; now oversample
smote_data = PCA_data.copy(deep=True)
oversamples_pd = pd.DataFrame(oversamples, columns=PCA_data.columns)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
smote_data = pd.concat([smote_data, oversamples_pd])
smote_data = smote_data.reset_index(drop=True)

# %%
unstable_smote = (smote_data["Bankrupt?"] == 1).sum()
stable_smote = (smote_data["Bankrupt?"] == 0).sum()
print("Oversampled Data Size:", smote_data.shape[0])
print("Number of Stable Companies:", stable_smote)
print("Number of Unstable Companies (with SMOTE):", unstable_smote)