# Codes.py
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 25 11:00:35 2020
@author: Prachi Palvi 75291019025
"""
#%%
"""
Shill Bidding Data Set
- Shill bidding is when someone bids
on an item to artificially increase its price,
desirability, or search standing.
- It can create an unfair advantage,
or cause another bidder to pay more than they should.
- The data set is labelled, so we can use
supervised-learning classification techniques to separate fraudulent shill bidders,
who show abnormal bidding behaviour, from normal bidders.
- We will be using three classification models:
1) Naive Bayes
2) Logistic Regression
3) Decision Tree classifier
"""
#%%
#importing required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,auc,roc_auc_score
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
#%% Exploratory data analysis
# Importing the data set.
# NOTE: this is an absolute path on the original author's machine; edit
# DATA_PATH to point at a local copy of Shill_Bidding_Dataset.csv.
DATA_PATH = "C:\\Users\\Admin\\Desktop\\Msc ASA sem 3\\SC 5\\ICA exam\\Shill_Bidding_Dataset.csv"
SBdata = pd.read_csv(DATA_PATH)
# `info` is a method: it has to be called to print column names, dtypes and
# non-null counts (the bare attribute access displayed nothing).
SBdata.info()
print(SBdata.shape)  # the dimensions of the data set
# Author's reading of the output above: the first three variables are unique
# identification variables; they are categorical and of no use for the model.
# The Class variable is the (dependent) target variable and variables 4 - 12
# are the independent variables contributing to it.

# Removing the unnecessary identifier columns.
SBdata1 = SBdata.drop(['Record_ID', 'Auction_ID', 'Bidder_ID'], axis=1)
summary = SBdata1.describe()  # summary statistics of the remaining columns
# Author's note: the summary shows that all variables are floats except for
# Auction_Duration.
# Check the ratio of fraud vs. normal bidders.
print("Class as pie chart:")
# value_counts() orders its result by frequency, not by class value, so index
# the counts explicitly as [0, 1]. That way the 'Genuine' label/colour always
# belongs to Class 0 and 'Fraud' to Class 1, whichever class is larger.
class_counts = SBdata1.Class.value_counts().reindex([0, 1], fill_value=0)
fig, ax = plt.subplots(1, 1)
ax.pie(class_counts, autopct='%1.1f%%', labels=['Genuine', 'Fraud'], colors=['yellowgreen', 'r'])
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.ylabel('')
# Author's result: 10.7 % of the bidders are fraudulent.
# Plot every variable separately for genuine and fraud bidders to see which
# variables differ noticeably between the two classes.
def plot_class_histograms(data, column, xlabel, bins=48):
    """Draw two stacked histograms of ``column``: genuine on top, fraud below.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``column`` and a ``Class`` column, where rows with
        Class == 0 are plotted as 'Genuine' and Class == 1 as 'Fraud'.
    column : str
        Name of the column to plot.
    xlabel : str
        Label for the shared x-axis.
    bins : int, optional
        Number of histogram bins used for both panels (default 48).

    Returns
    -------
    tuple
        ``(fig, (ax_genuine, ax_fraud))`` as created by ``plt.subplots``.
    """
    fig, (ax_genuine, ax_fraud) = plt.subplots(2, 1, sharex=True, figsize=(6, 3))
    ax_genuine.hist(data[column][data.Class == 0], bins=bins, color='g', alpha=0.5)
    ax_genuine.set_title('Genuine')
    ax_fraud.hist(data[column][data.Class == 1], bins=bins, color='r', alpha=0.5)
    ax_fraud.set_title('Fraud')
    # Only the bottom panel carries axis labels; the x-axis is shared.
    ax_fraud.set_xlabel(xlabel)
    ax_fraud.set_ylabel('Bids')
    return fig, (ax_genuine, ax_fraud)


# One entry per plot: (printed heading, column name, x-axis label).
# The author's observation for each plot is kept next to its entry.
HISTOGRAM_SPECS = [
    # Genuine bidder tendency ranges from 0 to 0.2 while fraud ranges from 0 to 1.
    ("Bidder Tendency", "Bidder_Tendency", "Bidder Tendency"),
    # Genuine bidding ratio ranges from 0 to 0.2 while fraud ranges from 0.1 to 0.8.
    ("Bidding Ratio", "Bidding_Ratio", "Bidding Ratio"),
    # Per the variable description, a shill bidder successively outbids himself
    # even though he is the current winner, to raise the price gradually with
    # small consecutive increments. In the plot a fraud/shill bidder has a
    # successive-outbidding value of 0.5 or 1.0 whereas a genuine bidder has 0.
    # (The x-axis label was previously misspelled 'Sucessive Outbidding'.)
    ("Successive Outbidding", "Successive_Outbidding", "Successive Outbidding"),
    # Similar results for both classes.
    ("Last bidding", "Last_Bidding", "Last bidding"),
    # Similar results for both classes.
    ("Auction_Bids", "Auction_Bids", "Auction_Bids"),
    # Similar results for both classes.
    ("Auction Starting Price", "Starting_Price_Average", "Starting_Price_Average"),
    # Similar results for both classes.
    ("Early Bidding", "Early_Bidding", "Early Bidding"),
    # Normal bidding mostly has a winning ratio of 0, whereas fraud has a
    # winning ratio ranging from 0.7 to 1.
    ("Winning_Ratio", "Winning_Ratio", "Winning_Ratio"),
    # Similar results for both classes.
    ("Auction_Duration", "Auction_Duration", "Auction_Duration"),
]

for heading, column, xlabel in HISTOGRAM_SPECS:
    print(heading)
    # fig/ax1/ax2 stay bound at module level to the most recent figure.
    fig, (ax1, ax2) = plot_class_histograms(SBdata1, column, xlabel)
"""
So the conclusion is that the variables
Bidder Tendency, Bidding Ratio, Successive Outbidding and Winning Ratio
are distributed significantly differently for the two values of the target variable Class (0 and 1),
whereas the distribution of the other variables is the same
for both class types (0 - normal bidding behaviour and 1 - fraud).
"""
# The distributions can also be visualised together, one panel per feature,
# with the fraud (red) and genuine (green) distributions overlaid.
import matplotlib.gridspec as gridspec

# First nine columns of SBdata1 -- presumably the nine features with Class as
# the last column; confirm against the CSV layout.
feature_columns = SBdata1.columns[:9]
n_features = len(feature_columns)
# Size the grid and the figure from the number of features actually plotted.
# The previous hard-coded 28 rows left most of a 112-inch-tall figure empty.
gs = gridspec.GridSpec(n_features, 1)
plt.figure(figsize=(6, n_features * 4))
for i, col in enumerate(feature_columns):
    ax5 = plt.subplot(gs[i])
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; histplot /
    # kdeplot are the documented replacements if the seaborn version allows.
    sns.distplot(SBdata1[col][SBdata1.Class == 1], bins=50, color='r')
    sns.distplot(SBdata1[col][SBdata1.Class == 0], bins=50, color='g')
    ax5.set_xlabel('')
    ax5.set_title('feature: ' + str(col))
plt.show()
#%% model building
# Train the models only on the variables that differ between the two classes.
SBdata2 = SBdata1[['Bidder_Tendency', 'Bidding_Ratio', 'Successive_Outbidding', 'Winning_Ratio', 'Class']]
# `head` is a method: call it and print the result so the preview is shown
# (the bare attribute access displayed nothing).
print(SBdata2.head())
# Split the data into train and test sets.
y = SBdata2['Class'].values  # target
X = SBdata2.drop(['Class'], axis=1).values  # features
# stratify=y keeps the class proportions equal in both splits; random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("train-set size: ", len(y_train),
      "\ntest-set size: ", len(y_test))
print("fraud cases in test-set: ", sum(y_test))
# The bare `X_train` / `X_test` / `y_train` / `y_test` expressions did nothing
# when run as a script; report the array shapes instead.
print("X_train:", X_train.shape, "X_test:", X_test.shape,
      "y_train:", y_train.shape, "y_test:", y_test.shape)
#%%
## Creating a pipeline function for common classifier algorithm
def get_predictions(clf, X_train, y_train, X_test):
    """Fit ``clf`` on the training data and predict on the test features.

    The classifier is fitted in place. As a side effect, the confusion matrix
    of the classifier's predictions on its own training data is printed.

    Parameters
    ----------
    clf : classifier object providing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training features and training labels.
    X_test : features to predict on.

    Returns
    -------
    tuple
        ``(y_pred, y_pred_prob)``: the result of ``clf.predict(X_test)`` and
        the result of ``clf.predict_proba(X_test)``.
    """
    clf.fit(X_train, y_train)

    # Hard labels and class probabilities for the held-out features.
    test_labels = clf.predict(X_test)
    test_probabilities = clf.predict_proba(X_test)

    # Training-set confusion matrix, shown for information only.
    train_confusion = confusion_matrix(y_train, clf.predict(X_train))
    print('train-set confusion matrix:\n', train_confusion)

    return test_labels, test_probabilities
## function to get classifiers score
def print_scores(y_test, y_pred, y_pred_prob):
    """Print the test-set confusion matrix and classification scores.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    y_pred_prob : 2-D array of predicted probabilities; column 1 is passed to
        ``roc_auc_score`` as the score.
    """
    print('test-set confusion matrix:\n', confusion_matrix(y_test, y_pred))

    # Label-based metrics, reported in a fixed order.
    labelled_metrics = (
        ("recall score: ", recall_score),
        ("precision score: ", precision_score),
        ("f1 score: ", f1_score),
        ("accuracy score: ", accuracy_score),
    )
    for label, metric in labelled_metrics:
        print(label, metric(y_test, y_pred))

    # ROC AUC is computed from the probabilities in column 1, not hard labels.
    roc_auc = roc_auc_score(y_test, y_pred_prob[:, 1])
    print("ROC AUC: {}".format(roc_auc))
#%%
# Naive Bayes: fit a Gaussian NB classifier and report its test-set scores.
nb_model = GaussianNB()
y_pred, y_pred_prob = get_predictions(
    clf=nb_model, X_train=X_train, y_train=y_train, X_test=X_test
)
print_scores(y_test=y_test, y_pred=y_pred, y_pred_prob=y_pred_prob)
# Author's reported result: accuracy = 96.91 %.
# Author's reading: the model has correctly classified all the 135 values
# as frauds / shill bidders.
#%%
# Logistic regression with a strong (C = 0.01) L1 penalty.
# The 'l1' penalty is not supported by scikit-learn's default 'lbfgs' solver
# (the default since version 0.22) and raises a ValueError at fit time, so a
# solver that supports L1 regularisation is selected explicitly.
lr_model = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
y_pred, y_pred_prob = get_predictions(lr_model, X_train, y_train, X_test)
print_scores(y_test, y_pred, y_pred_prob)
# Author's reported results: accuracy = 96.28 %; the recall score is also
# low (0.666).
# The results are not as good for this LR model; they could be improved
# further by training on undersampled data.
#%%
" Decision Tree Classifier "
# Training the decision tree.
# max_depth=4 pre-prunes the tree; the author reports the highest accuracy
# at depth 4.
clf = DecisionTreeClassifier(max_depth=4)
# get_predictions fits `clf` in place on the training data.
y_pred, y_pred_prob = get_predictions(clf, X_train, y_train, X_test)
print_scores(y_test, y_pred, y_pred_prob)
# Author's reported result: the decision tree classifier has the highest
# accuracy, 98.33 %.

# Visualise the tree that was actually trained and evaluated above.
# Previously a new, unpruned tree was fitted on X_test with the *predicted*
# labels, so the plot did not show the evaluated model.
clf2 = clf  # name kept so later references to clf2 continue to work
tree.plot_tree(clf2)
plt.show()
#%%
""" CONCLUSION
It is important to classify or predict which bidders are shill bidders who influence
bidding prices in a fraudulent way. Hence we built these models using a labelled data set.
The data set was analysed and checked for trends, missing values and
unnecessary variables, and was cleaned accordingly before modelling.
We trained Naive Bayes, Logistic Regression and Decision Tree classifiers.
The accuracy of the Decision Tree classifier is the highest, hence
it would be the best model to use for future predictions.
Further, using this data we can try unsupervised learning methods such as
clustering for labelling the data set.
"""
#%% Thank you!