def random_forest():
    """Build an MLTechnique wrapping a Random Forest classifier.

    Uses the module-level data_file, features, pickle_file, nlp and
    data_frame configuration.
    """
    return ml_technique.MLTechnique(
        name='Random Forest Predictor',
        data_file=data_file,
        model=ensemble.RandomForestClassifier(),
        features=features,
        ml_code='random',
        pickle_file=pickle_file,
        nlp=nlp,
        data_frame=data_frame,
    )
def k_neighbours():
    """Build an MLTechnique wrapping a k-Nearest Neighbours classifier.

    Uses the module-level data_file, features, pickle_file, nlp and
    data_frame configuration.
    """
    return ml_technique.MLTechnique(
        name='k-Nearest Neighbours Predictor',
        data_file=data_file,
        model=neighbors.KNeighborsClassifier(),
        features=features,
        ml_code='k',
        pickle_file=pickle_file,
        nlp=nlp,
        data_frame=data_frame,
    )
def naive():
    """Build an MLTechnique wrapping a Gaussian Naive Bayes classifier.

    Uses the module-level data_file, features, pickle_file, nlp and
    data_frame configuration.
    """
    return ml_technique.MLTechnique(
        name='Naive Bayes Predictor',
        data_file=data_file,
        model=naive_bayes.GaussianNB(),
        features=features,
        ml_code='naive',
        pickle_file=pickle_file,
        nlp=nlp,
        data_frame=data_frame,
    )
def log_reg():
    """Build an MLTechnique wrapping a Logistic Regression classifier.

    Uses the module-level data_file, features, pickle_file, nlp and
    data_frame configuration.
    """
    return ml_technique.MLTechnique(
        name='Logistic Regression Predictor',
        data_file=data_file,
        model=linear_model.LogisticRegression(),
        features=features,
        ml_code='log_reg',
        pickle_file=pickle_file,
        nlp=nlp,
        data_frame=data_frame,
    )
def d_tree():
    """Build an MLTechnique wrapping a Decision Tree classifier.

    Uses the module-level data_file, features, pickle_file, nlp and
    data_frame configuration.
    """
    return ml_technique.MLTechnique(
        name='Decision Tree Predictor',
        data_file=data_file,
        model=tree.DecisionTreeClassifier(),
        features=features,
        ml_code='d_tree',
        pickle_file=pickle_file,
        nlp=nlp,
        data_frame=data_frame,
    )
def svm_tech():
    """Build an MLTechnique wrapping a linear-kernel SVM classifier.

    Uses the module-level data_file, features, pickle_file, nlp and
    data_frame configuration.
    """
    return ml_technique.MLTechnique(
        name='SVM Predictor',
        data_file=data_file,
        model=svm.SVC(kernel='linear', C=1, gamma=1),
        features=features,
        ml_code='svm',
        pickle_file=pickle_file,
        nlp=nlp,
        data_frame=data_frame,
    )
# One spec per technique: (result key, display name, pickle path, model factory).
# Shared by the load/rebuild logic so the six constructions exist in ONE place
# instead of the two diverging copy-paste blocks the original carried.
_TECHNIQUE_SPECS = [
    ('svm', 'Support Vector Machine (w SA)', 'pickle/svm_tech.pickle',
     lambda: svm.SVC(kernel='linear', C=1, gamma=1)),
    ('dt', 'Decision Tree (w SA)', 'pickle/dt_tech.pickle',
     tree.DecisionTreeClassifier),
    ('log_reg', 'Logistic Regression (w SA)', 'pickle/log_reg_tech.pickle',
     linear_model.LogisticRegression),
    ('k', 'k-Nearest Neighbour (w SA)', 'pickle/k_tech.pickle',
     neighbors.KNeighborsClassifier),
    ('nb', 'Naive Bayes (w SA)', 'pickle/nb_tech.pickle',
     naive_bayes.GaussianNB),
    ('rf', 'Random Forest (w SA)', 'pickle/rf_tech.pickle',
     ensemble.RandomForestClassifier),
]


def _load_or_build_techniques(features):
    """Return the six MLTechnique instances keyed by code.

    Loads the pickled instances; rebuilds ALL of them against the master
    data file when any pickle is missing or when the cached SVM instance
    was trained on a different feature set (mirrors the original check,
    which only inspected the SVM instance's features).
    """
    loaded = {code: read_instance_pickle(pf)
              for code, _, pf, _ in _TECHNIQUE_SPECS}
    if None in loaded.values():
        print("cannot find instances")
    elif getattr(loaded['svm'], 'features') != features:
        print('instance has different features')
    else:
        return loaded
    # Rebuild every technique from scratch with the requested features.
    return {
        code: ml_technique.MLTechnique(
            name=name,
            data_file='artifacts/sel_data/master.xlsx',
            model=model_factory(),
            pickle_file=pickle_path,
            features=features,
        )
        for code, name, pickle_path, model_factory in _TECHNIQUE_SPECS
    }


def calculate_statistics(input_path, output_path, features, forced=False):
    """Bucket the NLP-processed data and score all six ML techniques.

    Parses the 'Balance' sheet of the workbook at *input_path*, runs the
    (cached) NLP pipeline over it, writes bucketed statistics to
    *output_path*, then computes accuracy and F1 for each technique.

    Parameters
    ----------
    input_path : str
        Path to an Excel workbook with a 'Balance' sheet indexed by 'jobid'.
    output_path : str
        Destination handed to ``stat.bucketing``.
    features : list
        Feature columns the techniques must have been trained on.
    forced : bool, optional
        Forwarded to the NLP and statistics layers to bypass their caches.

    Returns
    -------
    tuple
        ``(accuracies, f1_scores, timestamp)`` where both dicts are keyed
        by technique code ('svm', 'dt', 'log_reg', 'k', 'nb', 'rf') and
        *timestamp* is an 'HH:MM dd/mm/YYYY' string.
    """
    # Context manager guarantees the workbook handle is released even if
    # parsing raises (the original leaked it on failure).
    with pd.ExcelFile(input_path) as workbook:
        stat_data = workbook.parse('Balance', index_col='jobid')

    stat_nlp = read_instance_pickle('pickle/stat_nlp.pickle')
    if stat_nlp is None:
        stat_nlp = NLPTechnique(name='Stat NLP', nlp_vader=True, nlp_bow=True,
                                pp_lemma=True, pp_pos=True,
                                pickle_file='pickle/stat_nlp.pickle')
    stat_data = stat_nlp.process_data(stat_data, forced=forced)

    # Keep only the model features plus the target column, then bucket.
    stat_data = stat_data[features + ['category']]
    stat.bucketing(stat_data, output_path)

    techniques = _load_or_build_techniques(features)

    accuracies = {code: stat.get_accuracy(tech, forced=forced)
                  for code, tech in techniques.items()}
    f1_scores = {code: stat.get_f1_score(tech, forced=forced)
                 for code, tech in techniques.items()}

    # Time stamp marking when these results were produced.
    tt = datetime.datetime.now().strftime('%H:%M %d/%m/%Y')
    print("prediction done")
    return accuracies, f1_scores, tt
def ml_analysis():
    """Cross-validate every ML technique and package confusion matrices.

    Loads the six pickled MLTechnique instances (rebuilding all of them
    with the default feature set when any pickle is missing), runs
    cross-validation on each, and returns render-ready results.

    Returns
    -------
    dict
        ``{'techniques': [codes...], 'results': {code: {'name', 'labels',
        'matrix', 'analysis': {'input_size'}}}}``.
    """
    category_column = 'category'
    category_map = {"Detractor": 0, "Passive": 1, "Promoter": 2}

    # One spec per technique: (code, display name, pickle path, model factory).
    # Drives load, rebuild, and result packaging so the six constructions
    # exist in one place instead of the original unrolled copies.
    specs = [
        ('svm', 'Support Vector Machine (w SA)', 'pickle/svm_tech.pickle',
         lambda: svm.SVC(kernel='linear', C=1, gamma=1)),
        ('dt', 'Decision Tree (w SA)', 'pickle/dt_tech.pickle',
         tree.DecisionTreeClassifier),
        ('log_reg', 'Logistic Regression (w SA)', 'pickle/log_reg_tech.pickle',
         linear_model.LogisticRegression),
        ('k', 'k-Nearest Neighbour (w SA)', 'pickle/k_tech.pickle',
         neighbors.KNeighborsClassifier),
        ('nb', 'Naive Bayes (w SA)', 'pickle/nb_tech.pickle',
         naive_bayes.GaussianNB),
        ('rf', 'Random Forest (w SA)', 'pickle/rf_tech.pickle',
         ensemble.RandomForestClassifier),
    ]

    # Get the saved ML techniques.
    techniques = {code: read_instance_pickle(pf) for code, _, pf, _ in specs}

    # If any ML technique instance does not exist, create new instances.
    if None in techniques.values():
        print("cannot find instances")
        features = ['neu', 'neg', 'pos', 'compound', 'bag_vector',
                    'on_time_in_full', 'deliveryday', 'region']
        techniques = {
            code: ml_technique.MLTechnique(
                name=name,
                data_file='artifacts/sel_data/master.xlsx',
                model=model_factory(),
                pickle_file=pickle_path,
                features=features,
            )
            for code, name, pickle_path, model_factory in specs
        }

    # Cross-validate each technique.
    cv_results = {
        code: tech.cross_validate(cat_col=category_column, cat_map=category_map)
        for code, tech in techniques.items()
    }

    def analyze_result(name, cv_result):
        # Package one technique's cross-validation output for rendering.
        cm = confusion_matrix(cv_result['true_array'],
                              cv_result['predict_array'])
        return {
            'name': name,
            'labels': cv_result['categories'],
            'matrix': cm,
            'analysis': {'input_size': cv_result['total_size']},
        }

    return {
        'techniques': [code for code, _, _, _ in specs],
        'results': {
            code: analyze_result(techniques[code].get_name(), cv_results[code])
            for code, _, _, _ in specs
        },
    }