Example #1
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


def get_feature_upsampling():
    df = pd.read_csv("/home/liyulian/websafetyL/data/fraud/creditcard.csv")
    df['normAmount'] = StandardScaler().fit_transform(
        df['Amount'].values.reshape(-1, 1))
    df = df.drop(['Time', 'Amount'], axis=1)

    y = df['Class']
    features = df.drop(['Class'], axis=1).columns
    x = df[features]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)

    print("raw data")
    print(y_train.value_counts())

    os = SMOTE(random_state=0)
    x_train_1, y_train_1 = os.fit_resample(x_train, y_train)
    print("Smote data")
    print(y_train_1.value_counts())
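    # Note: only the original (un-resampled) split is returned below; the SMOTE
    # output (x_train_1, y_train_1) is just inspected here.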

    return x_train, x_test, y_train, y_test
Example #2
def get_feature_upsampling():
    df = pd.read_csv("../data/fraud/creditcard.csv")
    df['normAmount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))
    df = df.drop(['Time', 'Amount'], axis=1)

    y = df['Class']
    features = df.drop(['Class'], axis=1).columns
    x = df[features]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)

    print("raw data")
    print(y_train.value_counts())

    os = SMOTE(random_state=0)
    x_train_1, y_train_1 = os.fit_resample(x_train, y_train)
    print("Smote data")
    print(y_train_1.value_counts())


    return x_train, x_test, y_train, y_test
Example #3
#         a. With all features from RFE
#         b. With select features from RFE (see the sketch after the Model 1a cell below)
#     2) Regular model

# ### Model 1a: Over-Sampling (All Features)

# In[41]:

os = SMOTE(random_state=0)
Xceo_train, Xceo_test, yceo_train, yceo_test = train_test_split(Xceo,
                                                                yceo,
                                                                test_size=0.5,
                                                                random_state=0)
columns = Xceo_train.columns

os_ceo_X, os_ceo_y = os.fit_resample(Xceo_train, yceo_train)
os_ceo_X = pd.DataFrame(data=os_ceo_X, columns=columns)
os_ceo_y = pd.DataFrame(data=os_ceo_y, columns=['label'])

# In[42]:

print("length of oversampled ceos is ", len(os_ceo_X))
print("Number of non-CEOs in oversampled ceos",
      len(os_ceo_y[os_ceo_y['label'] == 0]))
print("Number of CEOs", len(os_ceo_y[os_ceo_y['label'] == 1]))
print("Proportion of non-ceos in oversampled ceos is ",
      len(os_ceo_y[os_ceo_y['label'] == 0]) / len(os_ceo_X))
print("Proportion of ceos in oversampled ceos is ",
      len(os_ceo_y[os_ceo_y['label'] == 1]) / len(os_ceo_X))
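
# Illustrative sketch (not from the original notebook): the "select features
# from RFE" step referenced in the outline above might look like this. The
# LogisticRegression estimator and the n_features_to_select value are assumptions.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=10)
rfe.fit(Xceo_train, yceo_train)
rfe_features = Xceo_train.columns[rfe.support_]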

# In[43]:
Example #4
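# (The inline numbers in the comments below are outputs recorded from the
#  original run.)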
np.sum(y_train == True)
# 16391
np.sum(y_train == False)
# 1504449
np.sum(y_test == True)
# 7490
np.sum(y_test == False)
# 689628

train_col_names = X_train.columns

# over-sampling using SMOTE-Synthetic Minority Oversampling Technique
from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)
os_data_X, os_data_y = os.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=train_col_names)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# check the lengths of data now
os_data_X.shape
# (2996702, 55)
len(os_data_y)
# 2996702
# percent of True
n_total = len(os_data_y)
n_true = sum(os_data_y['y'] == True)
n_true
# 1498351 (before oversampling: 23881)

n_false = sum(os_data_y['y'] == False)
Example #5
    def __init__(self, getfile, test_num):

        #-----SPLIT DATASETS-------
        self.getfile = getfile
        self.test_num = test_num

        tested = pd.read_csv(self.getfile)
        x = tested.iloc[:, [5, 6]].values
        # output
        y = tested.iloc[:, 7].values
        xtrain, xtest, ytrain, ytest = train_test_split(
            x, y, test_size=self.test_num, random_state=42)
        print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)
        #testx = len(xtest)
        #print(testx)
        #4 Feature Scaling

        # Feature scaling (standardization) is a data-preprocessing step applied to the
        # independent variables: it normalizes the features to a common range and can
        # speed up some learning algorithms.

        sc_x = StandardScaler()
        xtrain = sc_x.fit_transform(np.asarray(xtrain))
        xtest = sc_x.transform(np.asarray(xtest))

        counter = Counter(y)

        #---------------SMOTE ALGORITHM--------------------------

        print("Before OverSampling, counts of label '1': {}\n".format(
            sum(ytrain == 1)))
        print("Before OverSampling, counts of label '-1': {} \n".format(
            sum(ytrain == -1)))
        print('WITH SMOTE')

        os = RandomOverSampler(sampling_strategy='minority')
        # Resample only the scaled training split; resampling the full x, y would
        # leak test data into training.
        xtrain_res, ytrain_res = os.fit_resample(xtrain, ytrain)
        oversample = SMOTE()

        xtrain, ytrain = oversample.fit_resample(xtrain_res,
                                                 ytrain_res.ravel())

        counter = Counter(ytrain)
        print(counter)
        print('After OverSampling, the shape of train_X: {}'.format(
            xtrain.shape))
        print('After OverSampling, the shape of train_y: {} \n'.format(
            ytrain.shape))

        print("After OverSampling, counts of label '1': {}".format(
            sum(ytrain == 1)))
        print("After OverSampling, counts of label '-1': {}".format(
            sum(ytrain == -1)))

        #---------------LOGISTIC REGRESSION----------------------
        #5 Fitting the Logistic Regression to the Training Set:
        #We create a classifier object of LR class

        classifier = LogisticRegression()

        #Fit logistic regression model to the training set (Xtrain and ytrain)
        classifier.fit(xtrain, ytrain)
        #vget = classifier.vard
        #print(vget)

        #6 Predicting the Test set results
        # Use the classifier's predict method with xtest as the argument
        y_pred = classifier.predict(xtest)
        #print(y_pred)
        posed = 1
        neued = 1
        neged = 1

        import MySQLdb
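        # Persist results to MySQL: per-tweet predictions go to `hybrid_logitval`,
        # the preprocessed rows from the input CSV go to `baseline`.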

        mydb = MySQLdb.connect(host="127.0.0.1",
                               user="******",
                               password="",
                               database="logitregression_data")
        mycursor = mydb.cursor()
        logit = []
        with open('temp_file.csv', 'r') as tempo:
            read = csv.reader(tempo, delimiter=',')

            for tem in read:
                logit.append(tem)

        with open(getfile, 'r') as file:
            reader = csv.reader(file, delimiter=',')
            all_value = []
            counter = 0

            mycursor.execute("DELETE FROM hybrid_logitval")

            #----------- Split the logistic regression predictions by class and tally them to determine the overall result --------------
            for over in y_pred:
                counter += 1
                if over == 1:
                    posed += 1

                    resu = 'Positive'
                    regval = 1
                elif over == 0:
                    neued += 1

                    resu = 'Neutral'
                    regval = 0
                else:
                    neged += 1

                    resu = 'Negative'
                    regval = -1
                #stregval = str(regval)
                #valued = (counter,over,stregval, resu)

                query2 = "INSERT INTO `hybrid_logitval`(`HYB_ID`, `HYB_VALUE`, `HYB_SENTIMENT`, `HYB_RESULT`) VALUES (%s,%s,%s,%s)"
                mycursor.execute(query2,
                                 (counter, logit[counter], regval, resu))

            for row in reader:
                #print(row[0])
                value = (row[0], row[1], row[2], row[3], row[4], row[5],
                         row[6], row[7])
                all_value.append(value)

        mycursor.execute("DELETE FROM `baseline`")

        query = "INSERT INTO `baseline`(`ID`, `TWEETS`, `TOKENIZED`, `STOP_WORDS`, `STEMMED`, `POLARITY`, `SUBJECTIVITY`, `SENTIMENT`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"

        mycursor.executemany(query, all_value)
        mycursor.execute("DELETE FROM `baseline` WHERE `baseline`.`ID` = 0")

        mydb.commit()
        mydb.close()

        #---------------CONFUSION MATRIX----------------------
        #7 Making the Confusion Matrix. It contains the correct and incorrect predictions of our model

        # ytest holds the true test labels
        # y_pred holds the logistic regression predictions
        cm = confusion_matrix(ytest, y_pred)
        import warnings
        warnings.filterwarnings("ignore")
        cr = classification_report(ytest, y_pred)
        print(ytest)

        print("Confusion Matrix : \n", cm)
        print(cr)

        import mlxtend.plotting
        from mlxtend.plotting import plot_confusion_matrix
        class_names = ['-1', '0', '1']
        fig, ax = plot_confusion_matrix(conf_mat=cm,
                                        colorbar=True,
                                        class_names=class_names)
        fig.canvas.manager.set_window_title('HYBRID LOGISTIC REGRESSION')
        plt.ylabel('Actual label')
        plt.xlabel('Predicted label')
        plt.show()

        #-------SENDS ALL VALUES TO APPEAR ON THE USER INTERFACE----------------
        global accurate, confuse, posi, neut, nega, overall, plots, replot, percentage, reports
        accurate = accuracy_score(ytest, y_pred)
        print(accurate)
        percentage = "{:.0%}".format(accurate)
        confuse = cm
        print(percentage)
        posi = posed
        neut = neued
        nega = neged
        plots = y_pred
        replot = plt
        reports = cr

        if (neut >= posi) and (neut >= nega):
            overall = 'NEUTRAL'
        elif (posi >= neut) and (posi >= nega):
            overall = 'POSITIVE'
        else:
            overall = 'NEGATIVE'

        print(overall)
Example #7
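# Replace the raw numeric columns with their scaled counterparts (`scaled` is
# assumed to hold the standardized numeric features).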
telcom = telcom.drop(columns=num_cols, axis=1)
telcom = telcom.merge(scaled, left_index=True, right_index=True, how="left")

from imblearn.over_sampling import SMOTE
cols = [i for i in telcom.columns if i not in Id_col + target_col]

smote_X = telcom[cols]
smote_Y = telcom[target_col]

#Split train and test data
smote_train_X, smote_test_X, smote_train_Y, smote_test_Y = train_test_split(
    smote_X, smote_Y, test_size=.25, random_state=111)

#oversampling minority class using smote
os = SMOTE(random_state=0)
os_smote_X, os_smote_Y = os.fit_resample(smote_train_X, smote_train_Y)
os_smote_X = pd.DataFrame(data=os_smote_X, columns=cols)
os_smote_Y = pd.DataFrame(data=os_smote_Y, columns=target_col)

#splitting train and test data
train, test = train_test_split(telcom, test_size=.25, random_state=111)

## separating dependent and independent variables
cols = [i for i in telcom.columns if i not in Id_col + target_col]
train_X = train[cols]
train_Y = train[target_col]
test_X = test[cols]
test_Y = test[target_col]

# # 3. Common function for model prediction
Example #8
# In[19]:


X = X[columns1]
#y = y[target]
print(X.shape)
print(y.shape)


# In[20]:


OVERSAMPLING = True
if OVERSAMPLING:
    # RandomOverSampler duplicates minority-class rows until the classes are balanced
    os = RandomOverSampler()
    X_res, y_res = os.fit_resample(X, y)
else:
    X_res = X
    y_res = y


# In[21]:


#split into train and validation data
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(
    X_res, y_res, test_size=0.2, random_state=0, stratify=y_res)
print(X_train.shape, X_val.shape)

# If oversampling works, these should both print 0.5
print(np.average(Y_train))
print(np.average(Y_val))
Example #9
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier

train_data = data[data['TT'] == 'train']
x = train_data.drop(['TT', 'student_id', 'grant_amount', 'class_rank'], axis=1)
y = train_data['grant_amount']

test_data = data[data['TT'] == 'test']
test_x = test_data.drop(['TT', 'student_id', 'grant_amount', 'class_rank'], axis=1)
test_y = test_data['grant_amount']

# sampling_strategy=1.0 (formerly ratio=1.0) over-samples the minority class to
# match the majority class
os = RandomOverSampler(sampling_strategy=1.0)
X_overs, y_overs = os.fit_resample(x, y)

data_train, data_test, target_train, target_test = train_test_split(X_overs, y_overs)

xgb1 = XGBClassifier(
    learning_rate=0.02,
    n_estimators=820,
    max_depth=3,
    min_child_weight=0.5,
    gamma=0.01,
    subsample=0.7,
    colsample_bytree=0.7,
    colsample_bylevel=0.6,
    objective='multi:softmax',
    seed=10,
    nthread=8,
Example #10
#  Our dataset is strongly imbalanced (~4000 accepted vs 57000 not-accepted)
#  Use oversampling to generate pseudodata
#  Take 20% of dataset as test-set, use the rest for training
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    random_state=1)
columns = X_train.columns

X_test_orig = X_test
y_test_orig = y_test

#  Create oversampled dataset
os = SMOTE(random_state=1)
os_data_X_train, os_data_y_train = os.fit_resample(X_train,
                                                   y_train.values.ravel())
os_data_X_train = pd.DataFrame(data=os_data_X_train, columns=columns)
os_data_y_train = pd.DataFrame(data=os_data_y_train,
                               columns=['request_status'])
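#  Note: the test split is oversampled below as well; the untouched test data is
#  kept above in X_test_orig / y_test_orig.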

os_data_X_test, os_data_y_test = os.fit_resample(X_test, y_test.values.ravel())
os_data_X_test = pd.DataFrame(data=os_data_X_test, columns=columns)
os_data_y_test = pd.DataFrame(data=os_data_y_test, columns=['request_status'])

#  Plausibility check for oversampling
print(
    "\nApply oversampling to get equal ratio of acceptance/non-acceptance:\n")
print("New length of our oversampled dataset is ", len(os_data_X_train))
print("Number of non-acceptance in oversampled dataset",
      len(os_data_y_train[os_data_y_train['request_status'] == 0]))
print("Number of acceptance in oversampled dataset",
Example #11
if b > 0.5:
  print('Churn')
else:
  print('No Churn')

from sklearn.metrics import confusion_matrix , classification_report

print(classification_report(y_test,y_pred))

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
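# SMOTETomek combines SMOTE over-sampling with Tomek-link under-sampling to
# rebalance the training data.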

os = SMOTETomek()
X_train_ns, y_train_ns = os.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

model1 = Sequential([
    Dense(19, input_shape=(19,), activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

rs = model1.fit(X_train_ns, y_train_ns, epochs=200,
                validation_data=(X_test, y_test))

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=0)

## Scaling (fit the scaler on the training set, then apply it to the test set)

scaler = preprocessing.StandardScaler()
scaled_values_train = scaler.fit_transform(X_train)
scaled_values_test = scaler.transform(X_test)

X_scaled = scaled_values_train
columns = X.columns
X_test_scaled = pd.DataFrame(data=scaled_values_test, columns=columns)

##Over-sampling training data using SMOTE
os = SMOTE(random_state=0)
columns = X.columns
os_X_train, os_y_train = os.fit_resample(X_scaled, y_train)
os_data_X = pd.DataFrame(data=os_X_train, columns=columns)
os_data_y = pd.DataFrame(data=os_y_train, columns=['y'])

# Oversampling report
print("\nBalancing data with synthetic data..")
print("\nLength of synthetic training data:", (len(os_data_X) - len(X_train)))
print("Length of original training data:", len(X_train))
print("Length of oversampled training data:", len(os_data_X))
print("Proportion of negative examples in original data:",
      round(len(y_train[y_train == 0]) / len(y_train), 2))
print("Proportion of negative examples in oversampled data:",
      len(os_data_y[os_data_y['y'] == 0]) / len(os_data_X))

# Set the parameters by cross-validation