def check_email(): # set script to run at intervals (time.sleep?) - morn, mid-day, afternoon
    """Classify any emails that arrived since the last recorded check.

    Loads the timestamp of the previous check, fetches INBOX emails since
    then via ``cpm.get_emails``, vectorizes each new email's body+subject
    with the pickled feature model, predicts with the pickled classifier,
    and passes the prediction to ``eval_email``. Finishes by pickling the
    current time as the new last-check timestamp.

    NOTE(review): if the inbox is empty, the early return skips updating
    ``last_check_time.pkl``, so the next run re-checks the same window —
    confirm this is intentional (it is at least safe: no mail is missed).
    Relies on module-level ``cpm``, ``pkl_dir``, ``datetime``, ``os`` and
    ``eval_email`` being in scope.
    """
    # Capture "now" BEFORE fetching, so mail arriving during this run is
    # still covered by the next check window.
    current_time = datetime.now()
    previous_check = cpm.unpickle(os.path.join(pkl_dir, 'last_check_time.pkl'))
    print previous_check
    emails = cpm.get_emails('INBOX', previous_check)
    if not emails: # avoid unpickling if empty inbox
        return
    # Models are only loaded when there is actually new mail to score.
    feature_model = cpm.unpickle(os.path.join(pkl_dir, 'final_vec.pkl'))
    classifier_model = cpm.unpickle(os.path.join(pkl_dir, 'final_model.pkl'))
    print 'classifier model', classifier_model
    for email in emails:
        # get_emails may return borderline-old mail; re-filter by timestamp.
        if email.sent_at > previous_check:
            clean_b = cpm.clean_raw_txt(email.body)
            clean_s = cpm.clean_raw_txt(email.subject)
            print 'clean_email', clean_s, clean_b
            # Body and subject are concatenated into one document before
            # vectorizing — the model was presumably trained the same way.
            email_features = feature_model.transform([clean_b+clean_s])
            print 'email_bag', email_features.shape
            classifier_result = classifier_model.predict(email_features)
            eval_email(classifier_result, email)
    cpm.pickle(current_time, os.path.join(pkl_dir,'last_check_time.pkl'))
def main(model_name, X_test=None, y_test=None, models=None, save=False, train_fn='train_split.pkl', model_fn='final_model.pkl'):
    """Evaluate fitted classifiers on a held-out test set and plot ROC curves.

    Parameters:
        model_name: sequence of display names, parallel to ``models``
            (e.g. 'LogisticRegression', 'MultinomialNB', 'SVC',
            'RandomForest', 'GradientBoost').
        X_test, y_test: test features/labels; when ``X_test`` is None the
            pickled train/test split ``train_fn`` is loaded instead.
        models: fitted classifiers; when None, the single pickled model
            ``model_fn`` is loaded from ``pkl_dir``.
        save: forwarded to ``model_eval``/``plot_roc_curve`` to persist output.
        train_fn, model_fn: pickle filenames under ``pkl_dir``.

    Relies on module-level ``cpm``, ``pkl_dir``, ``model_eval`` and
    ``plot_roc_curve``.
    """
    if models is None:
        # Default to evaluating just the saved final model.
        final_model = cpm.unpickle(os.path.join(pkl_dir, model_fn))
        models = [final_model]
    if X_test is None:
        # Recover the persisted split; only the test half is used here.
        split = cpm.unpickle(os.path.join(pkl_dir, train_fn))
        X_train, X_test, y_train, y_test = split
        X_test = X_test.todense()
    for idx, clf in enumerate(models):
        model_eval(model_name[idx], clf, X_test, y_test, save)
    # Single combined ROC plot; trim names in case more were supplied.
    plot_roc_curve(model_name[:len(models)], models, X_test, y_test, save)
def main(model_names, save=False, train_fn='train_split.pkl', model_fn='final_model.pkl', new_data_split=False, random=11):
    """Train one classifier per requested name on the training split.

    Parameters:
        model_names: iterable of keys selecting which estimators to fit —
            'LogisticRegression', 'MultinomialNB', 'SVC', 'RandomForest',
            'GradientBoost'.
        save: forwarded to ``setup_model`` to persist fitted models.
        train_fn: pickle filename of the saved train/test split.
        model_fn: accepted for interface compatibility; not used here.
        new_data_split: when True, build a fresh vectorized split via
            ``cpm.create_datasets`` instead of loading the pickled one.
        random: random seed forwarded to ``cpm.create_datasets``.

    Returns:
        list of ``setup_model`` results, one per entry in ``model_names``.

    Relies on module-level ``cpm``, ``pkl_dir``, ``setup_model`` and the
    sklearn estimator classes.
    """
    if new_data_split:
        X_train, X_test, y_train, y_test = cpm.create_datasets(save=save, vectorize=True, random=random)
    else:
        X_train, X_test, y_train, y_test = cpm.unpickle(os.path.join(pkl_dir, train_fn))
        # Sparse matrix from the pickled split; densify for training.
        X_train = X_train.todense()
    # Registry of untrained estimators keyed by the public model name.
    registry = {
        'LogisticRegression': LogisticRegression(C=10000.0 , penalty='l2', class_weight='auto', fit_intercept=True),
        'MultinomialNB': MultinomialNB(alpha=0.100000, fit_prior=True),
        'SVC': SVC(C=10000.0, kernel='linear', shrinking=True, probability=True, class_weight='auto'),
        'RandomForest': RandomForestClassifier(n_estimators=121, criterion='entropy', bootstrap=True, oob_score=True, max_features='auto'),
        'GradientBoost': GradientBoostingClassifier()
    }
    return [setup_model(chosen, registry[chosen], X_train, y_train, save)
            for chosen in model_names]
def get_feature_names(vectorizer=None, vec_fn='final_vec.pkl'):
    """Return the feature (vocabulary) names of a fitted vectorizer.

    Parameters:
        vectorizer: a fitted vectorizer exposing ``get_feature_names()``;
            when None, the pickled vectorizer ``vec_fn`` is loaded from
            ``pkl_dir`` via ``cpm.unpickle``.
        vec_fn: pickle filename of the saved vectorizer (used only when
            ``vectorizer`` is None).

    Returns:
        The vectorizer's feature names (a list of strings for sklearn
        vectorizers).
    """
    # Fix: identity check instead of `== None` — `==` would invoke a custom
    # __eq__ on the passed object and could spuriously match (PEP 8).
    if vectorizer is None:
        vectorizer = cpm.unpickle(os.path.join(pkl_dir, vec_fn))
    return vectorizer.get_feature_names()