Example #1
0
def update_db(model):
    """Score every un-scored document in `collection` and persist the result.

    Pulls all documents that do not yet carry a 'prob' field, runs them
    through the `Features` cleaning pipeline and the fitted `model`, bins
    the positive-class probability into a risk label, and writes that
    label back onto each source document.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    None
    """
    # Materialize the un-scored documents once: building the frame from
    # records avoids the O(n^2) cost of growing a DataFrame row by row
    # (DataFrame.append was removed in pandas 2.0).
    docs = list(collection.find({'prob': {'$exists': False}}))
    if not docs:
        # Nothing new to score.
        return None

    new_events = pd.DataFrame.from_records(docs).reset_index(drop=True)
    new_events['object_id'] = new_events['object_id'].astype(int)

    features = Features()
    X = features.features_clean(new_events)

    # Positive-class probability, binned into three risk buckets.
    probs = model.predict_proba(X)[:, 1]
    X['prob'] = pd.cut(
        pd.Series(probs, index=X.index),
        bins=[0.0, 0.3, 0.6, 1.0],
        labels=['Low Risk', 'Medium Risk', 'High Risk'])
    # Carry the id through as a plain Python object for the Mongo query.
    X['object_id'] = new_events['object_id'].astype('object')

    # Write each risk label back onto its source document.
    for record in X.to_dict('records'):
        collection.find_one_and_update({'object_id': record['object_id']},
                                       {"$set": {
                                           'prob': record['prob']
                                       }})

    return None
Example #2
0
import pandas as pd
import numpy as np
from class_features import Features
from class_models import NaiveBayes
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

# Load the raw event data, parsing every timestamp column up front.
date_columns = [
    'approx_payout_date',
    'event_created',
    'event_published',
    'event_start',
    'event_end',
    'user_created',
]
df = pd.read_json('data/data.json', convert_dates=date_columns)

# Feature engineering: produces the model matrix X and the target y.
features = Features()
X, y = features.features_clean(df)


# =============================================================================
# Naive Bayes model
# =============================================================================
def parse_text(X):
    """Convert a Series of HTML fragments into plain-text documents.

    Parameters
    ----------
    X : pandas.Series of HTML strings.

    Returns
    -------
    list of str
        One space-joined, newline/tab-free text per entry of ``X``.
    """
    parsed_text = []
    # Iterate the values directly; Series.iteritems was removed in
    # pandas 2.0 and the index was only used to re-fetch the value.
    for html in X:
        soup = BeautifulSoup(html, 'html.parser')
        # find_all(string=True) collects every text node, ignoring markup
        # (modern spelling of the deprecated findAll(text=True)).
        texts = soup.find_all(string=True)
        document = " ".join(list(texts))
        document = document.replace('\n', '')
        document = document.replace('\t', '')
        parsed_text.append(document)
    # The original snippet discarded the result; return it so callers
    # can actually use the parsed documents.
    return parsed_text
Example #3
0
from class_models import Linear, GLM
import scipy.stats as scs

# Load the data dictionary plus the train/test splits.
metadata = pd.read_excel('data/Data Dictionary.xlsx')
Train = pd.read_csv('data/Train.csv')
Test = pd.read_csv('data/Test.csv')

# Side-by-side histograms: raw sale price vs. its log transform.
fig, axs = plt.subplots(1, 2)
price_panels = [
    (Train['SalePrice'], 'Sale Price'),
    (np.log(Train['SalePrice']), 'Logs of Sale Price'),
]
for ax, (values, title) in zip(axs, price_panels):
    ax.hist(values)
    ax.set_title(title)

# Feature matrix and target variable.
X = Features().features_clean(Train)
y = Train['SalePrice']

# Ordinary linear model fit on the log-transformed target.
linear = Linear(X, np.log(y))
cv_linear = linear.cv_mse(5)
linear.resid_scatter(X, np.log(y))
linear.model_summary()

# Gamma-family GLM with log link on the raw target.
glm = GLM(X, y)
cv_glm = glm.cv_mse(5)
glm.resid_scatter(X, y)
glm.model_summary()

# Predictions Test data