-
Notifications
You must be signed in to change notification settings - Fork 0
/
enova.py
96 lines (75 loc) · 3.57 KB
/
enova.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#SQL vs pandas tute https://medium.com/jbennetcodes/how-to-rewrite-your-sql-queries-in-pandas-and-more-149d341fc53e
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt #Data visualisation libraries
import seaborn as sns
#! pip install pandasql
import pandasql as ps
tr=pd.read_csv(r'D:\enova\Assessment_Files\Assessment_Files\training.csv')
tr.info()
a=tr.describe() #descriptive statistics only numerical variables
tr.columns
#clean variable with characters other than 0-9 and .
tr['Measurements2'] = tr['Measurements'].str.replace(r'[^.0-9]', ',')
#split the Measurements variable in to 3 columns
tr['length'],tr['width'],tr['hight']=tr['Measurements2'].str.split(',',2).str
cols=['length','width','hight']
tr[cols]=tr[cols].apply(pd.to_numeric)
del cols
del (tr['Measurements'], tr['Measurements2'])
tr['Profit']=tr['Retail']-tr['Price']
tr['Profit_flg'] = pd.np.where(tr['Profit']>=0,1,0)
sns.pairplot(tr)
sns.distplot(tr['Price'])
sns.distplot(tr['Retail'])
plt.box(tr['Retail'])
plt.scatter(tr['Carats'],tr['Price'])
plt.xlabel('Retail')
plt.ylabel('Price')
plt.title('Scatter Plot of R vs P')
sns.catplot(x="Vendor", y="Price", hue="Polish1", kind="swarm", data=tr)
corr=tr.corr() #correlation
# plot the heatmap
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
tr['Cart'] = pd.cut(tr.Carats,
bins=[0, 0.5, 1, 1.5, 2, 3, 4, 5,15],
labels=['0_0.5', '0.5_1', '1-1.5', '1.5-2', '2-3','3-4','4-5','5+'],
include_lowest=True)
final=pd.crosstab(tr.Vendor, tr.Cart,tr.Price, aggfunc="mean")
final
final2=pd.crosstab(tr.Cert, tr.Cart,tr.Profit_flg, aggfunc="sum")
final2
q = """SELECT Cert , Profit_flg ,avg(Profit) as Avg_Profit ,count(0) as freq FROM tr group by Cert , Profit_flg;"""
print(ps.sqldf(q, locals()))
q = """SELECT Vendor , Cart ,avg(Price) as Avg_Pr ,count(0) as freq FROM tr group by Vendor, Cart;"""
print(ps.sqldf(q, locals()))
q = """SELECT Shape ,count(0) as count FROM tr group by Shape;"""
print(ps.sqldf(q, locals()))
#remove white space
tr['Shape1'] = tr['Shape'].str.replace(r'\s+', '')
tr['Shape2'] = pd.np.where(tr.Shape1.str.contains('Marquis|Marwuise'),'Marquise',
pd.np.where(tr.Shape1.str.contains('ROUND'),'Round',tr['Shape1']))
tr['Shape']=tr['Shape2']
del (tr['Shape1'], tr['Shape2'])
q1="""SELECT Cert ,Vendor ,count(0) as count FROM tr group by Cert, Vendor;"""
print(ps.sqldf(q1, locals()))
tr['Cert']=tr.Cert.fillna('None')
tr['Clarity1'] = pd.np.where(tr.Clarity.str.contains('N'),'None',tr['Clarity'])
tr['Color']=tr["Color"].str[0:1] #remove variations of the main color
tr.Fluroescence1.unique()
tr['Fluroescence1'] = pd.np.where(tr['Fluroescence']==' ','None',tr['Fluroescence'] )
tr['Fluroescence1'] = pd.np.where(tr['Fluroescence1']=='','None',tr['Fluroescence1'] )
tr['Fluroescence1'] = tr.Fluroescence1.fillna('None') #change NAN values to some other value
tr['Fluroescence1'] = pd.np.where(tr['Fluroescence']==' ',np.nan,tr['Fluroescence'] )
tr['Fluroescence1'] = pd.np.where(tr['Fluroescence1']=='',np.nan,tr['Fluroescence1'] )
tr['Polish1'] = pd.np.where(tr['Polish']==' ','None',tr['Polish'] )
X=tr[['length',]]
Y=tr[['Price']]
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X,Y)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
y = tr["Profit_flg"]
X = tr[["Cert","Cart","Cut","Shape","Table","Symmetry","Regions","Shape","Vendor","Polish1"]]
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
dt.fit(X, y)