def train_model():
    """Train and persist a random-forest price model for the diamonds data.

    Reads ``static/datasets/diamonds.csv``, label-encodes the ``cut``,
    ``color`` and ``clarity`` columns, drops ``depth`` and ``table``, and fits
    a ``RandomForestRegressor`` that predicts ``price`` from the remaining
    columns using an 80 % training split (``random_state=42``).  The three
    fitted encoders and the fitted model are pickled under
    ``static/saved_models/``.

    Returns:
        str: A fixed status message.
    """
    df = pd.read_csv('static/datasets/diamonds.csv')

    # Each categorical column gets its own encoder, pickled right after it
    # has been fitted.
    encoder_paths = (
        ('cut', 'static/saved_models/cut_le.pkl'),
        ('color', 'static/saved_models/color_le.pkl'),
        ('clarity', 'static/saved_models/clarity_le.pkl'),
    )
    for column, path in encoder_paths:
        encoder = le()
        df[column] = encoder.fit_transform(df[column])
        # A context manager guarantees the pickle is flushed and the handle
        # closed, instead of leaving that to garbage collection.
        with open(path, 'wb') as fh:
            pickle.dump(encoder, fh)

    df.drop(['depth', 'table'], axis=1, inplace=True)
    X = df.drop('price', axis=1)
    y = df.price

    # Only the training portion is used here; the held-out 20 % is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.20, random_state=42)

    model = RandomForestRegressor().fit(X_train, y_train)
    with open('static/saved_models/randomforestregressor.pkl', 'wb') as fh:
        pickle.dump(model, fh)

    # Spelling kept as-is: callers may compare against this exact text.
    return 'model was built sucessfully'
def predict():
    """Apply a pickled model to a CSV file and write one prediction per row.

    All inputs are read from module-level entry objects via ``.get()``:

    * ``e1`` - path of the pickled model.
    * ``e2`` - path of the CSV file to predict on.
    * ``e4``, ``e5`` - 1-based positions of columns to drop (``0`` = drop
      nothing).  The second position is resolved after the first drop.
    * ``e6`` - marker string for missing cells; rows containing it (or the
      literal text ``'NaN'``) are removed.  ``'0'`` disables the filter.
    * ``e3`` - path of the output file.

    Text columns are converted with ``clean_float`` when every value parses,
    otherwise they are label-encoded.  The feature matrix is standardised
    with ``preprocessing.scale`` before prediction.  The predictions are
    printed and written to the output file, one per line.
    """
    model_path = e1.get()
    # NOTE(security): unpickling can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        clf = pickle.load(model_file)

    df_test = pd.read_csv(e2.get())

    # Optional column removal, in the order the positions are entered.
    for entry in (e4, e5):
        position = int(entry.get())
        if position != 0:
            df_test.drop(df_test.columns[position - 1], axis=1, inplace=True)

    missing_marker = e6.get()
    if missing_marker != '0':
        for column in df_test:
            df_test = df_test[~df_test[column].isin([missing_marker, 'NaN'])]
        # Restore a 0..n-1 index so the first row is addressable again.
        df_test.reset_index(drop=True, inplace=True)

    for column in df_test:
        # The first value decides whether the column is treated as text.
        if isinstance(df_test[column].iloc[0], str):
            try:
                df_test[column] = df_test[column].apply(clean_float)
            except ValueError:
                # Not numeric text: fall back to integer category codes.
                encoder = le()
                encoder.fit(df_test[column])
                df_test[column] = encoder.transform(df_test[column])

    # NOTE(review): the scaling statistics come from this file alone, not
    # from the data the model was trained on - confirm this is intended.
    x = preprocessing.scale(np.array(df_test))

    # One predict() call per row keeps each item a length-1 array, which is
    # what determines the text written to the output file below.
    y = [clf.predict([row]) for row in x]
    print(y)

    pred_file = e3.get()
    with open(pred_file, 'w') as f:
        for item in y:
            f.write("%s\n" % item)
def analyse():
    """Render summary plots for an uploaded CSV dataset.

    Reads ``user_id``, ``project_id`` and ``filename`` from
    ``request.get_json()["params"]``, loads ``user_id/project_id/filename``
    through ``read_file`` and parses it as CSV.  When the request body has a
    top-level ``"label_encode"`` key, every column is read as text and
    label-encoded.  Missing values are replaced by 0 before plotting.

    Returns:
        A JSON string with the keys ``plot``, ``hp_plot``, ``dp_plot``,
        ``bp_plot`` and ``sm_plot`` (each holding the result of
        ``write_base64_img``), or ``apierrors.ErrorMessage("dataset not
        found")`` when ``read_file`` returns ``None``.
    """
    from pandas.plotting import scatter_matrix

    req = request.get_json()
    params = req["params"]
    user_id = params["user_id"]
    project_id = params["project_id"]
    filename = params["filename"]

    # NOTE(review): the path is assembled from request values; presumably
    # read_file guards against traversal ("../") - verify.
    full_path = user_id + "/" + project_id + "/" + filename
    dataset_file = read_file(full_path)
    if dataset_file is None:
        return apierrors.ErrorMessage("dataset not found")

    csv_text = dataset_file.decode('utf-8')
    if "label_encode" in req:
        # Parse once, as text.  Re-reading a StringIO that an earlier
        # read_csv call has already consumed would start at end-of-stream
        # and find no data.
        dataset = pd.read_csv(StringIO(csv_text), dtype="unicode")
        dataset = dataset.apply(le().fit_transform)
    else:
        dataset = pd.read_csv(StringIO(csv_text))
    dataset = dataset.fillna(0)

    hp = plt.subplot()
    dataset.hist(ax=hp, figsize=(12, 12))
    dp = dataset.plot(kind='density')
    bp = dataset.plot(kind='box')
    sm = scatter_matrix(dataset, figsize=(12, 12))

    resultset = {
        "plot": write_base64_img(user_id, project_id, "plot.png",
                                 plot(dataset.plot())),
        "hp_plot": write_base64_img(user_id, project_id, "hp.png", plot(hp)),
        "dp_plot": write_base64_img(user_id, project_id, "dp.png", plot(dp)),
        "bp_plot": write_base64_img(user_id, project_id, "bp.png", plot(bp)),
        # scatter_matrix returns a grid of axes; its first cell is passed on.
        "sm_plot": write_base64_img(user_id, project_id, "sm.png",
                                    plot(sm[0][0])),
    }
    return json.dumps(resultset)
def run():
    """Train the selected regressors on a CSV file and report their scores.

    All inputs are read from module-level objects via ``.get()``:

    * ``e1`` - path of the CSV file.
    * ``e2``, ``e3`` - 1-based positions of columns to drop (``0`` = drop
      nothing).  The second position is resolved after the first drop.
    * ``e4`` - marker string for missing cells; rows containing it (or the
      literal text ``'NaN'``) are removed.  ``'0'`` disables the filter.
    * ``e5`` - name of the target column.
    * ``var1`` / ``var2`` - a value of 1 selects the SVR / the 3-neighbour
      KNN regressor.
    * ``e6`` / ``e7`` - output paths for the pickled SVR / KNN model.

    Text columns are converted with ``clean_float`` when every value parses,
    otherwise they are label-encoded.  The data is split 70/30, both parts
    are standardised, each selected model is fitted and pickled, and its
    ``score`` on the test part is written into ``result``.
    """
    df = pd.read_csv(e1.get())

    # Optional column removal, in the order the positions are entered.
    for entry in (e2, e3):
        position = int(entry.get())
        if position != 0:
            df.drop(df.columns[position - 1], axis=1, inplace=True)

    missing_marker = e4.get()
    if missing_marker != '0':
        for column in df:
            df = df[~df[column].isin([missing_marker, 'NaN'])]
        # Restore a 0..n-1 index so the first row is addressable again.
        df.reset_index(drop=True, inplace=True)

    for column in df:
        # The first value decides whether the column is treated as text.
        if isinstance(df[column].iloc[0], str):
            try:
                df[column] = df[column].apply(clean_float)
            except ValueError:
                # Not numeric text: fall back to integer category codes.
                encoder = le()
                encoder.fit(df[column])
                df[column] = encoder.transform(df[column])

    target = e5.get()
    y = np.array(df[target])
    x = np.array(df.drop(target, axis=1))

    x_train, x_test, y_train, y_test = tts(x, y, test_size=0.3)
    # NOTE(review): train and test are standardised independently, so the
    # two parts do not share one scale - confirm this is intended.
    x_train = preprocessing.scale(x_train)
    x_test = preprocessing.scale(x_test)

    use_svr = var1.get()
    use_knn = var2.get()

    # NOTE(review): score() on scikit-learn regressors is R^2, so the
    # "Squared error Accuracy" wording looks inaccurate; text kept unchanged.
    if use_svr == 1:
        clf1 = svm.SVR(kernel='rbf')
        clf1.fit(x_train, y_train)
        with open(e6.get(), 'wb') as f:
            pickle.dump(clf1, f)
        result.delete('1.0', END)
        result.insert(END, 'Squared error Accuracy for SVR: '
                      + str(clf1.score(x_test, y_test)) + '\n')

    if use_knn == 1:
        clf2 = KNR(3)
        clf2.fit(x_train, y_train)
        with open(e7.get(), 'wb') as f:
            pickle.dump(clf2, f)
        # Clear the output only when the SVR branch has not just written
        # its own line, so both scores stay visible together.
        if use_svr != 1:
            result.delete('1.0', END)
        result.insert(END, 'Squared error Accuracy for KNN: '
                      + str(clf2.score(x_test, y_test)))
# More employees left in the technical department.

# Salary vs. left: bar chart of how many employees left per salary level.
pd.crosstab(employees.salary, employees.left).plot(kind='bar')
plt.title('Turnover vs salary')
plt.xlabel('salary')
plt.ylabel('Left')
plt.savefig('salary_Left bar_chart')

# Data preprocessing
# Label-encode the two text columns (positions 7 and 8) of x.
from sklearn.preprocessing import LabelEncoder as le, OneHotEncoder

labelencoder_x = le()
x[:, 7] = labelencoder_x.fit_transform(x[:, 7])
# The same encoder object is refitted here, so afterwards it only holds the
# mapping for column 8.
x[:, 8] = labelencoder_x.fit_transform(x[:, 8])

# One-hot encode column 7.
# NOTE(review): `categorical_features` only exists in older scikit-learn
# releases; on newer ones this step needs a ColumnTransformer instead.
onehotencoder = OneHotEncoder(categorical_features=[7])
x = onehotencoder.fit_transform(x).toarray()

# Feature selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
# Pass the feature count by keyword: newer scikit-learn releases reject it
# as a positional argument, and the keyword form works on old ones too.
rfe = RFE(model, n_features_to_select=10)
# In[ ]: get_ipython().run_cell_magic( u'html', u'', u"<div class='tableauPlaceholder' id='viz1535718122614' style='position: relative'><noscript><a href='#'><img alt='Story 2 ' src='https://public.tableau.com/static/images/Ti/Titanic2_32/Story2/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='Titanic2_32/Story2' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/Ti/Titanic2_32/Story2/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='filter' value='publish=yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1535718122614'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='1016px';vizElement.style.height='991px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>" ) # **Converting categorical data to numeric form** # In[ ]: from sklearn.preprocessing import LabelEncoder as le from sklearn.preprocessing import OneHotEncoder as ohe for c in train.columns: if train[c].dtype == 'object': z1 = le().fit_transform(train[c].astype(str)) train[c] = ohe(sparse=False).fit_transform(z1.reshape(len(z1), 1)) z1 = le().fit_transform(test[c].astype(str)) test[c] = ohe(sparse=False).fit_transform(z1.reshape(len(z1), 1)) z1 = le().fit_transform(train['Age'].astype(str)) train['Age'] = ohe(sparse=False).fit_transform(z1.reshape(len(z1), 1)) 
z1 = le().fit_transform(test['Age'].astype(str)) test['Age'] = ohe(sparse=False).fit_transform(z1.reshape(len(z1), 1)) z1 = le().fit_transform(train['Fare'].astype(str)) train['Fare'] = ohe(sparse=False).fit_transform(z1.reshape(len(z1), 1)) z1 = le().fit_transform(test['Fare'].astype(str)) test['Fare'] = ohe(sparse=False).fit_transform(z1.reshape(len(z1), 1)) # **Dropping Some unnecessary Features** # cabin has more than 70% of the data missing
iris.head() # 2.1 Separate predictors and target X = iris.iloc[: , 0:4] # Predictors: First 4 columns y = iris.iloc[:, 4] # Target: Last, 5th column # 2.2 Split X and y into train and test data X_train, X_test, y_train, y_test = train_test_split(X ,y, test_size = 0.3) X_train.shape # (105,4) X_test.shape # (45,4) y_train[:4] # 2.2 Encode y_train from object to inetger enc = le() # Create an instance of class labelencoder enc.fit(y_train) # Let the object learn data y_tr = enc.transform(y_train) # Let it encode y_tr # 2.3 Check mapping enc.classes_ # array(['setosa', 'versicolor', 'virginica'] # Corresponds to 0,1,2 # 2.4 Verify: enc.transform(['setosa','versicolor', 'virginica']) # 3. Start modeling # 3.1 Initialize our decision tree object. # Supply relevant parameters ct = dt( criterion="gini", # Alternative 'entropy'
X_validate[col] = X_validate[col].astype(float) skew = X_validate[col].skew() print "for ",col," skew is :",skew if skew >2 : X_validate[col] = X_validate[col] + 10 X_validate[col] = X_validate[col].apply(np.log) + 10 X[:100].to_csv("X.csv",index=False) # for col in X.columns: # print col # print X[col].dtype # print X[col].isnull().any().any() # print X[col].unique() # sys.exit(1) l = le() for col in col_to_encode: X[col] = l.fit_transform(X[col]) X_validate[col] = l.fit_transform(X_validate[col]) X["inc_by_amnt"] = (X["annual_inc"].astype(float)/X["loan_amnt"].astype(float)).apply(np.log) X["int_by_late_fee"] = X["total_rec_int"].astype(float) + X["total_rec_late_fee"].astype(float) X["amnt_*_int"] = (X["loan_amnt"].astype(float)*X["int_rate"].astype(float)).apply(np.sqrt) X.insert(0, 'payment_completion', (X['last_week_pay']/(X['term']/12*52+1))*100) X['payment_completion'] = X['payment_completion'].astype(int) X[:100].to_csv("X.csv",index=False) #sys.exit(1) X_validate["inc_by_amnt"] = (X_validate["annual_inc"].astype(float) / \ X_validate["loan_amnt"].astype(float)).apply(np.log) X_validate["int_by_late_fee"] = X_validate["total_rec_int"].astype(float) +\ X_validate["total_rec_late_fee"].astype(float)
#read file and split to dependant and independant dataset = pd.read_csv('Data.csv') x = dataset.iloc[:, 0:-1].values y = dataset.iloc[:, 3].values #fill nan values by mean x[:, 1:] = sip(missing_values=np.nan, strategy='mean').fit_transform(x[:, 1:]) #check dataset print("dataset:\n", dataset) #encode x to zeroes and ones x = ct([('Country', ohe(), [0])], remainder='passthrough').fit_transform(x) #encode y to zeroes and ones y = le().fit_transform(y) #count nan """total = dataset.isnull().sum().sort_values(ascending=False) percent = (dataset.isnull().sum()/dataset.isnull().count()).sort_values(ascending=False)*100 missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) missing_data.head(20) print("total :\n",missing_data) """ #take some values as training and predict output of some test cases x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0) print("x:\n", x) print("x_train before scaling:\n", x_train)
titanic_training.isnull().sum() # ##### Converting categorical variables # To build a predictive model, all independent variables need to be of a consistent form. i.e. Binary. Sex and Embarked variables are categorical and need to be encoded. # ##### Sex # In[37]: titanic_training['Sex'][0:5] # In[38]: from sklearn.preprocessing import LabelEncoder as le label_encoder = le() titanic_training['Sex'] = label_encoder.fit_transform(titanic_training['Sex']) titanic_training['Sex'][0:5] # 1 = male / 0 = female # - Sex variable has now been encoded with 1 representing male and 0 representing female # ##### Embarked # In[39]: titanic_training['Embarked'] = titanic_training['Embarked'].astype(str) # In[40]: titanic_training['Embarked'][0:6]
X_te=sc_X.transform(X_te) sc_y=ss() y_tr=sc_y.fit_transform(y_tr) regressor=lr() regressor.fit(X_tr,y_tr) y_pred=regressor.predict(X_te) plt.scatter(X_tr,y_tr,color="green") plt.plot(X_tr,regressor.predict(X_tr),color="red") plt.title("工资及年限(训练)") plt.xlabel("工作年限/年") plt.ylabel("工资") plt.show() plt.scatter(X_tr,y_tr,color="green") plt.plot(X_tr,regressor.predict(X_tr),color="red") plt.title("工资及年限(测试)") plt.xlabel("工作年限/年") plt.ylabel("工资") plt.show()''' #多元线性回归 dataset = pd.read_csv("50_Startups.csv") X = dataset.iloc[:, :-1].values y = dataset.iloc[:, 1].values le = le() X[:, 3] = le.fit_transform(X[:, 3]) onehotencoder = OneHotEncoder(categorical_features=[3]) X = onehotencoder.fit_transform(X).toarray() X = X[:, 1:] X_tr, X_te, y_tr, y_te = tts(X, y, test_size=0.3, random_state=0) regressor = lr() regressor.fit(X_tr, y_tr) y_pred = regressor.predict(X_te)
def encode(V, n_values):
    """Remap *V* with ``label_mapper`` and label-encode the result.

    Args:
        V: Values handed to ``label_mapper``.
        n_values: Second argument handed to ``label_mapper``.

    Returns:
        tuple: ``(encoded, new_values)`` - the remapped values turned into
        integer codes by a freshly fitted label encoder, and the second item
        returned by ``label_mapper``, passed through unchanged.
    """
    remapped, remapped_values = label_mapper(V, n_values)
    return le().fit_transform(remapped), remapped_values
import sys
import pickle as pkl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder as le

# Usage: <script> PICKLE_FILE [D1 D2]
#
# Loads a pickled object, then scatter-plots its entries D1 against D2,
# coloured by the label-encoded ``cell_type`` attribute.
if __name__ == '__main__':
    with open(sys.argv[1], 'rb') as fh:
        x = pkl.load(fh)

    # The two keys are only taken from the command line when both are
    # supplied; any other argument count falls back to 0 and 1.
    d1, d2 = 0, 1
    if len(sys.argv) == 4:
        d1, d2 = int(sys.argv[2]), int(sys.argv[3])

    colors = le().fit_transform(x.cell_type)
    plt.scatter(x[d1], x[d2], c=colors, s=0.5)
    plt.show()
sns.pairplot(data=cust_group, hue='Gender') # In[27]: df.drop(columns=['Annual_Income_cat'], inplace=True) #Dropping columns not needed # In[28]: df # In[29]: from sklearn.preprocessing import LabelEncoder as le enc = le() # In[31]: df['Gender'] = enc.fit_transform(df['Gender']) df.head() # In[32]: find_cls = [] for i in range(1, 15): kmean = KMeans(n_clusters=i) kmean.fit(df) find_cls.append(kmean.inertia_) # In[33]:
# Dropping columns that aren't required for prediction...

# In[ ]:

target = train['Survived']
train = train.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
test = test.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

# Using label encoding to convert categorical (object) columns to integer
# codes. One encoder per column is fitted on the train and test values
# together, so a given category receives the same code in both frames.
# (No scaling of the codes is applied here.)

# In[ ]:

from sklearn.preprocessing import LabelEncoder as le

for col in train.columns:
    if train[col].dtype == 'object':
        # astype(str) also turns missing values into the text 'nan', so
        # they become a category of their own.
        train_values = train[col].astype(str)
        test_values = test[col].astype(str)
        encoder = le().fit(train_values.tolist() + test_values.tolist())
        train[col] = encoder.transform(train_values)
        test[col] = encoder.transform(test_values)

# Adding features Family and Alone as these can be critical info for
# survival prediction...

# In[ ]:


def alone(d):
    """Return 1 when *d* is greater than 0, otherwise 0."""
    return 1 if d > 0 else 0


train['Family_Size'] = train['SibSp'] + train['Parch']
# Run the selector over the non-encoded training columns and wrap the
# resulting array in a DataFrame again.
other_df = selector.fit_transform(other_X)
other_df= pd.DataFrame(other_df)
other_df #removed the variables with zero variance
# Turn ID and y into DataFrames.
ID=ID.to_frame()
type(ID)
y=y.to_frame()
type(y)
# Test data: object columns go to the label-encoding path; the X0..X8
# columns listed below are dropped from the copy that goes to the selector.
encode_test_data = test_data.select_dtypes(include=['object'])
other_test_data=test_data.copy()
other_test_data.drop(['X0','X1','X2','X3','X4','X5','X6','X8'],axis=1, inplace=True)
# NOTE(review): a new selector is fitted on the test data here, so the
# columns it keeps may differ from those kept for the training data - confirm.
selector = vt()
other_test_df = selector.fit_transform(other_test_data)
other_test_df= pd.DataFrame(other_test_df)
encode_X = encode_X.apply(le().fit_transform) #Label encoding
encode_X
# NOTE(review): the test columns are encoded with encoders fitted on the
# test data only, so codes need not match the training codes - confirm.
encode_test_data = encode_test_data.apply(le().fit_transform)
# Join the encoded and the selected columns side by side.
X_train = pd.concat([encode_X,other_df], axis=1)
X_train.notnull()
X_train.isnull()
X_test = pd.concat([encode_test_data,other_test_df],axis=1)
X_test.notnull()
X_test.isnull()
X_train
#applying PCA on dataset
for w in arr: try: M.append(model[w]) except: continue M = np.array(M) v = M.sum(axis=0) if type(v) != np.ndarray: return np.zeros(length + 2) else: return v / np.sqrt((v**2).sum()) # label transation from sklearn.preprocessing import LabelEncoder as le label_tool = le() y = label_tool.fit_transform(rawdata.topic.values) rawdata["y"] = y neg_label = rawdata[rawdata["topic"] == "IRRELEVANT"]['y'].iloc[0] def trans_label(x, neg_): if x == neg_: return 0 elif x < neg_: return x + 1 else: return x
from sklearn.preprocessing import StandardScaler as ss

# import the dataset
# Raw string: '\D' is not a valid escape sequence, so the plain literal only
# worked by accident and triggers a warning on current Python versions.
dataset = pd.read_csv(r'data\Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# replace missing data in columns 1-2 of X using the mean of each column
imputer = im(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encode categorical data: column 0 of X becomes integer codes
labelencode_X = le()
X[:, 0] = labelencode_X.fit_transform(X[:, 0])

# dummy encoding the data
# NOTE(review): `categorical_features` only exists in older scikit-learn
# releases - confirm the pinned version before upgrading.
ohotencode = ohe(categorical_features=[0])
X = ohotencode.fit_transform(X).toarray()

# encode the target as integer class codes
labelencode_Y = le()
y = labelencode_Y.fit_transform(y)

# splitting the data into train and test set (20 % test, fixed seed)
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)

# feature scaling
standardscale_X = ss()
X_train = standardscale_X.fit_transform(X_train)
France 37 67000 Yes ''' dataset=pd.read_csv("data_bak.csv") X=dataset.iloc[:,:-1].values y=dataset.iloc[:,3].values #show(0) #补充缺失数据 imputer=Imputer(missing_values="NaN",strategy="mean", \ axis=0,verbose=0,copy=True) imputer=imputer.fit(X[:,1:3]) X[:,1:3]=imputer.transform(X[:,1:3]) #show(0) #给数据打上标签 le_y=le() y=le_y.fit_transform(y) le_X=le() X[:,0]=le_X.fit_transform(X[:,0]) #show(0) #哑变量编码 enc=OneHotEncoder(categorical_features=[0]) X=enc.fit_transform(X).toarray() #show(0) #划分训练集和测试集 X_tr,X_te,y_tr,y_te=tts(X,y,test_size=0.3,random_state=0) show(1) #特征缩放
no_of_rows, no_of_col = df.shape

s = input("enter the NaN character")

# Drop every row in which at least one cell equals the marker string.
# The rows are selected with a mask first, because DataFrame.drop with
# inplace=True returns None (so nothing can be chained onto it) and because
# rows must not be removed while iterrows() is still walking the frame.
rows_with_marker = (df == s).any(axis=1)
df.drop(index=df.index[rows_with_marker], inplace=True)
# Renumber the rows so the first remaining row is addressable again.
df.reset_index(drop=True, inplace=True)

no_of_rows, no_of_col = df.shape


def clean_float(st):
    """Return *st* as a float after removing comma separators.

    Raises:
        ValueError: If the text left after removing commas is not a number.
    """
    return float(str(st).replace(',', ''))


for column in df:
    # The first value decides whether the column is treated as text; an
    # empty frame has nothing to convert.
    if not df.empty and isinstance(df[column].iloc[0], str):
        try:
            df[column] = df[column].apply(clean_float)
        except ValueError:
            # Not numeric text: fall back to integer category codes.
            encoder = le()
            encoder.fit(df[column])
            df[column] = encoder.transform(df[column])

# Call head() so the first rows are printed rather than the bound method.
print(df.head())
'n_estimators': [90, 100, 110], 'learning_rate': [0.1, 0.13, 0.09], 'max_depth': [5, 6, 7] } knn = {'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]} # In[71]: p = lgb(max_depth=7) # In[72]: from sklearn.preprocessing import LabelEncoder as le for c in x.columns: if x[c].dtype == 'object': x[c] = le().fit_transform(x[c].astype(str)) # In[73]: x.Age = le().fit_transform(x.Age.astype(str)) x.Fare = le().fit_transform(x.Fare.astype(str)) y.Age = le().fit_transform(y.Age.astype(str)) y.Fare = le().fit_transform(y.Fare.astype(str)) # In[74]: x = x.apply(lambda f: f.fillna(f.median())) # In[75]: xtrain, xval, ztrain, zval = tts(x, z, train_size=0.7)
""" import numpy as np import matplotlib.pyplot as plt import pandas as pd import time as t data = pd.read_csv("Churn_Modelling.csv") x = data.iloc[:, 3:13].values #all the columns except the last one is considered y = data.iloc[:, 13].values #label encoding from sklearn.preprocessing import LabelEncoder as le from sklearn.preprocessing import OneHotEncoder as ohe le_x_1 = le() #label encoder object created for country x[:, 1] = le_x_1.fit_transform( x[:, 1]) #label encoder object linked with the 2nd column of the data table le_x_2 = le() #label encoder object created for gender x[:, 2] = le_x_2.fit_transform(x[:, 2]) ohec = ohe(categorical_features=[ 1 ]) #index of the column is to be specified for the onehot encoding x = ohec.fit_transform(x).toarray() #now we have to fit the ohec object into x = x[:, 1:] #to eliminate the dummy variable trap(like for three classes a dummy variable set of 2 is fine(third is automatically set)) #data splitting from sklearn.model_selection import train_test_split as tts
''' # Write your code here import pandas as pd import numpy as np from sklearn.model_selection import train_test_split as tts from sklearn.ensemble import RandomForestClassifier as rfc from sklearn.preprocessing import LabelEncoder as le # from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import classification_report as cr from sklearn.metrics import confusion_matrix as cm train = pd.read_csv("train.csv") test = pd.read_csv("test.csv") # print(train.columns) # train.drop(labels='ID', axis=1, inplace=True) le2 = le() train['ID'] = le2.fit_transform(train['ID']) print('Missing Values(train): ', train.isna().sum().sum()) print('Missing Values(test): ', test.isna().sum().sum()) test.fillna(value=0, inplace=True) y = train['Result'] X = train.drop(labels='Result', axis=1) X_train, x_test, y_train, y_test = tts(X, y, test_size=0.25) # X_train,x_test,y_train,y_test = tts(X,y,test_size=0.01) clf = rfc(n_estimators=100, max_depth=5, random_state=11) clf = clf.fit(X_train, y_train) # pred=clf.predict(x_test) # print('Classification Report:',cr(y_test, pred)) # print('Confusion Metrics:\n',cm(y_test, pred)) teid = test['ID'] # test.drop(labels='ID', axis=1, inplace=True)