def update_db(model):
    """Label every not-yet-scored event in ``collection`` with a risk band.

    Documents that have no ``prob`` field are fetched, turned into a feature
    matrix with ``Features().features_clean`` and scored with
    ``model.predict_proba``.  Column 1 of the returned probabilities is
    bucketed into 'Low Risk' [0.0, 0.3], 'Medium Risk' (0.3, 0.6] or
    'High Risk' (0.6, 1.0], and that label is stored in the ``prob`` field of
    the document with the matching ``object_id``.

    Args:
        model: Object exposing ``predict_proba(X)`` that returns a 2-D array;
            column 1 is used as the score.

    Returns:
        None.  When no document is missing ``prob`` nothing is written.
    """
    frames = []
    for doc in collection.find({'prob': {'$exists': False}}):
        # One single-row frame per document: keys become columns.
        event = pd.DataFrame.from_dict(doc, orient='index').T
        # Read the id from the document itself instead of calling int() on a
        # one-element Series.
        event['object_id'] = int(doc['object_id'])
        frames.append(event)

    if not frames:
        # Nothing to score (and pd.concat raises on an empty list).
        return None

    # A single concat replaces the row-by-row DataFrame.append, which copied
    # the whole frame on every iteration.  Every per-document frame has index
    # 0, so reset_index() also keeps that old index as an 'index' column,
    # exactly as the frame was handed to features_clean before.
    new_events = pd.concat(frames).reset_index()

    features = Features()
    X = features.features_clean(new_events)

    X['prob'] = model.predict_proba(X)[:, 1]
    # include_lowest=True closes the first bin on the left so a probability
    # of exactly 0.0 is labelled 'Low Risk' instead of becoming NaN.
    X['prob'] = pd.cut(X['prob'],
                       bins=[0.0, 0.3, 0.6, 1.0],
                       labels=['Low Risk', 'Medium Risk', 'High Risk'],
                       include_lowest=True)
    # Assigned by index alignment with new_events.  The object dtype is
    # presumably there so the ids come out as plain Python ints for the
    # database driver -- TODO confirm.
    X['object_id'] = new_events['object_id'].astype('object')

    # Only the id and the label are needed for the write-back.
    for object_id, label in zip(X['object_id'], X['prob']):
        collection.find_one_and_update({'object_id': object_id},
                                       {"$set": {'prob': label}})
    return None
import pandas as pd
import numpy as np
from class_features import Features
from class_models import NaiveBayes
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

# Load the raw data; the listed columns are parsed as dates.
df = pd.read_json('data/data.json',
                  convert_dates=[
                      'approx_payout_date', 'event_created',
                      'event_published', 'event_start', 'event_end',
                      'user_created'
                  ])

# Features
features = Features()
X, y = features.features_clean(df)

# =============================================================================
# Naive Bayes model
# =============================================================================


def parse_text(X):
    """Strip HTML markup from every entry of ``X`` and return the plain text.

    Each value is parsed with BeautifulSoup's ``html.parser``; its text nodes
    are joined with single spaces and newline / tab characters are removed.

    Args:
        X: pandas Series whose values are HTML strings.

    Returns:
        list of str: one cleaned document per entry of ``X``, in order.
    """
    parsed_text = []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    # The yielded value is used directly rather than looked up again with
    # X.loc[idx], which would return a Series when index labels repeat.
    for _, html in X.items():
        soup = BeautifulSoup(html, 'html.parser')
        texts = soup.findAll(text=True)
        document = " ".join(texts)
        document = document.replace('\n', '')
        document = document.replace('\t', '')
        parsed_text.append(document)
    # NOTE(review): no return was visible after the loop in the reviewed
    # text; the accumulated list is returned so callers can use the result.
    return parsed_text
from class_models import Linear, GLM
import scipy.stats as scs

# Data dictionary plus the train / test tables.
metadata = pd.read_excel('data/Data Dictionary.xlsx')
Train = pd.read_csv('data/Train.csv')
Test = pd.read_csv('data/Test.csv')

# Target variable histograms: sale price on the left, its log on the right.
fig, axs = plt.subplots(1, 2)
for _ax, _values, _title in zip(
        axs,
        (Train['SalePrice'], np.log(Train['SalePrice'])),
        ('Sale Price', 'Logs of Sale Price')):
    _ax.hist(_values)
    _ax.set_title(_title)

# Features
X = Features().features_clean(Train)
y = Train['SalePrice']

# Linear model with log transformed target variable
linear = Linear(X, np.log(y))
cv_linear = linear.cv_mse(5)
linear.resid_scatter(X, np.log(y))
linear.model_summary()

# GLM Gamma family with log-link
glm = GLM(X, y)
cv_glm = glm.cv_mse(5)
glm.resid_scatter(X, y)
glm.model_summary()

# Predictions Test data