Beispiel #1
0
 def random_forest():
     """Return a new MLTechnique labelled 'Random Forest Predictor'.

     The technique wraps an ``ensemble.RandomForestClassifier`` constructed
     with no arguments and is tagged with ``ml_code='random'``.
     ``data_file``, ``features``, ``pickle_file``, ``nlp`` and ``data_frame``
     are not parameters: they are read from the surrounding scope, which is
     not visible in this snippet.
     """
     # A dict literal evaluates its values top to bottom, i.e. in the same
     # order as the keyword arguments of a direct call.
     settings = {
         'name': 'Random Forest Predictor',
         'data_file': data_file,
         'model': ensemble.RandomForestClassifier(),
         'features': features,
         'ml_code': 'random',
         'pickle_file': pickle_file,
         'nlp': nlp,
         'data_frame': data_frame,
     }
     return ml_technique.MLTechnique(**settings)
Beispiel #2
0
 def k_neighbours():
     """Return a new MLTechnique labelled 'k-Nearest Neighbours Predictor'.

     The technique wraps a ``neighbors.KNeighborsClassifier`` constructed with
     no arguments and is tagged with ``ml_code='k'``. ``data_file``,
     ``features``, ``pickle_file``, ``nlp`` and ``data_frame`` are read from
     the surrounding scope, which is not visible in this snippet.
     """
     # Values are evaluated in the same order as the original keyword call.
     settings = {
         'name': 'k-Nearest Neighbours Predictor',
         'data_file': data_file,
         'model': neighbors.KNeighborsClassifier(),
         'features': features,
         'ml_code': 'k',
         'pickle_file': pickle_file,
         'nlp': nlp,
         'data_frame': data_frame,
     }
     return ml_technique.MLTechnique(**settings)
Beispiel #3
0
 def naive():
     """Return a new MLTechnique labelled 'Naive Bayes Predictor'.

     The technique wraps a ``naive_bayes.GaussianNB`` constructed with no
     arguments and is tagged with ``ml_code='naive'``. ``data_file``,
     ``features``, ``pickle_file``, ``nlp`` and ``data_frame`` are read from
     the surrounding scope, which is not visible in this snippet.
     """
     # Values are evaluated in the same order as the original keyword call.
     settings = {
         'name': 'Naive Bayes Predictor',
         'data_file': data_file,
         'model': naive_bayes.GaussianNB(),
         'features': features,
         'ml_code': 'naive',
         'pickle_file': pickle_file,
         'nlp': nlp,
         'data_frame': data_frame,
     }
     return ml_technique.MLTechnique(**settings)
Beispiel #4
0
 def log_reg():
     """Return a new MLTechnique labelled 'Logistic Regression Predictor'.

     The technique wraps a ``linear_model.LogisticRegression`` constructed
     with no arguments and is tagged with ``ml_code='log_reg'``.
     ``data_file``, ``features``, ``pickle_file``, ``nlp`` and ``data_frame``
     are read from the surrounding scope, which is not visible in this
     snippet.
     """
     # Values are evaluated in the same order as the original keyword call.
     settings = {
         'name': 'Logistic Regression Predictor',
         'data_file': data_file,
         'model': linear_model.LogisticRegression(),
         'features': features,
         'ml_code': 'log_reg',
         'pickle_file': pickle_file,
         'nlp': nlp,
         'data_frame': data_frame,
     }
     return ml_technique.MLTechnique(**settings)
Beispiel #5
0
 def d_tree():
     """Return a new MLTechnique labelled 'Decision Tree Predictor'.

     The technique wraps a ``tree.DecisionTreeClassifier`` constructed with no
     arguments and is tagged with ``ml_code='d_tree'``. ``data_file``,
     ``features``, ``pickle_file``, ``nlp`` and ``data_frame`` are read from
     the surrounding scope, which is not visible in this snippet.
     """
     # Values are evaluated in the same order as the original keyword call.
     settings = {
         'name': 'Decision Tree Predictor',
         'data_file': data_file,
         'model': tree.DecisionTreeClassifier(),
         'features': features,
         'ml_code': 'd_tree',
         'pickle_file': pickle_file,
         'nlp': nlp,
         'data_frame': data_frame,
     }
     return ml_technique.MLTechnique(**settings)
Beispiel #6
0
 def svm_tech():
     """Return a new MLTechnique labelled 'SVM Predictor'.

     The technique wraps ``svm.SVC(kernel='linear', C=1, gamma=1)`` and is
     tagged with ``ml_code='svm'``. ``data_file``, ``features``,
     ``pickle_file``, ``nlp`` and ``data_frame`` are read from the
     surrounding scope, which is not visible in this snippet.
     """
     # NOTE(review): gamma is presumably ignored by a linear kernel -- confirm
     # against the scikit-learn SVC documentation before relying on it.
     # Values are evaluated in the same order as the original keyword call.
     settings = {
         'name': 'SVM Predictor',
         'data_file': data_file,
         'model': svm.SVC(kernel='linear', C=1, gamma=1),
         'features': features,
         'ml_code': 'svm',
         'pickle_file': pickle_file,
         'nlp': nlp,
         'data_frame': data_frame,
     }
     return ml_technique.MLTechnique(**settings)
Beispiel #7
0
def _fresh_stat_techniques(features):
    """Create the six '(w SA)' MLTechnique instances for ``features``.

    Returns a dict keyed by 'svm', 'dt', 'log_reg', 'k', 'nb' and 'rf' (in
    that insertion order). Every instance is pointed at
    'artifacts/sel_data/master.xlsx' and at its own pickle path
    'pickle/<key>_tech.pickle'.
    """
    # (key, display name, zero-argument model factory). Factories keep model
    # construction lazy so nothing is built unless this helper is called.
    blueprints = (
        ('svm', 'Support Vector Machine (w SA)',
         lambda: svm.SVC(kernel='linear', C=1, gamma=1)),
        ('dt', 'Decision Tree (w SA)', tree.DecisionTreeClassifier),
        ('log_reg', 'Logistic Regression (w SA)', linear_model.LogisticRegression),
        ('k', 'k-Nearest Neighbour (w SA)', neighbors.KNeighborsClassifier),
        ('nb', 'Naive Bayes (w SA)', naive_bayes.GaussianNB),
        ('rf', 'Random Forest (w SA)', ensemble.RandomForestClassifier),
    )
    return {
        key: ml_technique.MLTechnique(name=label,
                                      data_file='artifacts/sel_data/master.xlsx',
                                      model=make_model(),
                                      pickle_file='pickle/{}_tech.pickle'.format(key),
                                      features=features)
        for key, label, make_model in blueprints
    }


def calculate_statistics(input_path, output_path, features, forced=False):
    """Bucket the 'Balance' sheet and score the six saved ML techniques.

    Steps, in order:

    1. Parse sheet 'Balance' of the workbook at ``input_path`` (indexed by
       'jobid') and run it through the pickled 'Stat NLP' technique, creating
       that technique if no pickle is found.
    2. Keep only ``features`` plus the 'category' column and hand the frame
       to ``stat.bucketing`` together with ``output_path``.
    3. Load the six pickled MLTechnique instances. All six are rebuilt when
       any is missing or when any of them was built for a different feature
       list than ``features``.
    4. Collect ``stat.get_accuracy`` and ``stat.get_f1_score`` for each.

    Args:
        input_path: Path of the Excel workbook to read.
        output_path: Forwarded unchanged to ``stat.bucketing``.
        features: List of feature column names; must be a list because it is
            concatenated with ``['category']``.
        forced: Forwarded to ``process_data``, ``get_accuracy`` and
            ``get_f1_score`` (presumably forces recomputation -- confirm in
            those helpers).

    Returns:
        A 3-tuple ``(accuracies, f1_scores, timestamp)``. Both dicts are keyed
        by 'svm', 'dt', 'log_reg', 'k', 'nb', 'rf'; ``timestamp`` is the
        local time formatted as '%H:%M %d/%m/%Y'.
    """
    # The context manager closes the workbook even if parse() raises.
    with pd.ExcelFile(input_path) as workbook:
        stat_data = workbook.parse('Balance', index_col='jobid')

    stat_nlp = read_instance_pickle('pickle/stat_nlp.pickle')
    if stat_nlp is None:
        stat_nlp = NLPTechnique(name='Stat NLP',
                                nlp_vader=True,
                                nlp_bow=True,
                                pp_lemma=True,
                                pp_pos=True,
                                pickle_file='pickle/stat_nlp.pickle')
    stat_data = stat_nlp.process_data(stat_data, forced=forced)
    stat_data = stat_data[features + ['category']]
    stat.bucketing(stat_data, output_path)

    keys = ('svm', 'dt', 'log_reg', 'k', 'nb', 'rf')
    techniques = {
        key: read_instance_pickle('pickle/{}_tech.pickle'.format(key))
        for key in keys
    }

    if any(tech is None for tech in techniques.values()):
        print("cannot find instances")
        techniques = _fresh_stat_techniques(features)
    # Check every instance, not just the SVM one: the pickles are stored
    # separately and can drift apart.
    elif any(tech.features != features for tech in techniques.values()):
        print('instance has different features')
        techniques = _fresh_stat_techniques(features)

    # Calculate statistics: all accuracies first, then all F1 scores, matching
    # the original call order.
    accuracies = {key: stat.get_accuracy(techniques[key], forced=forced) for key in keys}
    f1_scores = {key: stat.get_f1_score(techniques[key], forced=forced) for key in keys}

    # Add time stamp to the result
    tt = datetime.datetime.now().strftime('%H:%M %d/%m/%Y')

    print("prediction done")
    return accuracies, f1_scores, tt
Beispiel #8
0
def ml_analysis():
    """Cross-validate the six saved ML techniques and package the results.

    The techniques are loaded from 'pickle/<key>_tech.pickle'. If any of them
    is missing, all six are recreated with a fixed feature list. Each
    technique is then cross-validated against the 'category' column using the
    Detractor/Passive/Promoter -> 0/1/2 mapping.

    Returns:
        A dict with two entries:

        * 'techniques': the list ['svm', 'dt', 'log_reg', 'k', 'nb', 'rf'].
        * 'results': per key, a dict holding the technique's 'name', the
          'labels' taken from the cross-validation result, the confusion
          'matrix', and 'analysis' -> {'input_size': ...}.
    """
    label_column = 'category'
    label_codes = {
        "Detractor": 0,
        "Passive": 1,
        "Promoter": 2
    }

    # Processing order is significant: loading, construction, cross-validation
    # and packaging all walk the techniques in this sequence.
    order = ['svm', 'dt', 'log_reg', 'k', 'nb', 'rf']
    pickle_paths = {
        'svm': 'pickle/svm_tech.pickle',
        'dt': 'pickle/dt_tech.pickle',
        'log_reg': 'pickle/log_reg_tech.pickle',
        'k': 'pickle/k_tech.pickle',
        'nb': 'pickle/nb_tech.pickle',
        'rf': 'pickle/rf_tech.pickle',
    }

    # Get the saved ML techniques
    techniques = {key: read_instance_pickle(pickle_paths[key]) for key in order}

    # If any saved instance is missing, replace all of them with new ones.
    # Membership on a tuple keeps the original identity-or-equality test.
    if None in tuple(techniques.values()):
        print("cannot find instances")
        features = ['neu', 'neg', 'pos', 'compound', 'bag_vector', 'on_time_in_full', 'deliveryday', 'region']
        # key -> (display name, zero-argument model factory)
        blueprints = {
            'svm': ('Support Vector Machine (w SA)',
                    lambda: svm.SVC(kernel='linear', C=1, gamma=1)),
            'dt': ('Decision Tree (w SA)', tree.DecisionTreeClassifier),
            'log_reg': ('Logistic Regression (w SA)', linear_model.LogisticRegression),
            'k': ('k-Nearest Neighbour (w SA)', neighbors.KNeighborsClassifier),
            'nb': ('Naive Bayes (w SA)', naive_bayes.GaussianNB),
            'rf': ('Random Forest (w SA)', ensemble.RandomForestClassifier),
        }
        techniques = {}
        for key in order:
            label, make_model = blueprints[key]
            techniques[key] = ml_technique.MLTechnique(
                name=label,
                data_file='artifacts/sel_data/master.xlsx',
                model=make_model(),
                pickle_file=pickle_paths[key],
                features=features)

    # Cross Validation (completed for every technique before any packaging)
    cv_results = {}
    for key in order:
        cv_results[key] = techniques[key].cross_validate(cat_col=label_column,
                                                         cat_map=label_codes)

    def package(name, cv_result):
        """Shape one cross-validation result for rendering."""
        matrix = confusion_matrix(cv_result['true_array'], cv_result['predict_array'])
        summary = {'input_size': cv_result['total_size']}
        return {
            'name': name,
            'labels': cv_result['categories'],
            'matrix': matrix,
            'analysis': summary
        }

    packaged = {}
    for key in order:
        packaged[key] = package(techniques[key].get_name(), cv_results[key])

    return {
        'techniques': order,
        'results': packaged
    }