    def predict(selected_columns):
        fit_model = joblib.load(model_file_name)

        predict_set = pd.read_csv(csvdata, names=COLUMNS,
                                  skipinitialspace=True, skiprows=1)
        del predict_set[LABEL]

        # drop unselected columns, encode education, impute, and one-hot encode
        predict_set_imputed = deepcopy(predict_set)

        remove_list = list(set(COLUMNS) - set(selected_columns))
        predict_set_imputed = predict_set_imputed.drop(
            remove_list + ['marital', 'job', 'contact'], axis=1)

        # encode education ordinally; map leftover 'unknown' entries to NaN
        # so they get imputed with the column mean below
        predict_set_imputed['education'] = ternary_vectorizing(
            predict_set_imputed['education'],
            ['primary', 'secondary', 'tertiary'])
        predict_set_imputed['education'] = predict_set_imputed[
            'education'].replace('unknown', np.nan)
        predict_set_imputed.fillna(predict_set_imputed.mean(),
                                   inplace=True)
        predict_set_imputed = one_hot(predict_set_imputed,
                                      list(set(CATEGORICAL_COLUMNS_2).intersection(
                                          set(selected_columns))))

        # NOTE: refitting the scaler here assumes the prediction data matches
        # the training distribution; reusing the scaler fitted during
        # training would be safer
        predict_set_sc_scaled_imputed = standard_scaler.fit_transform(
            predict_set_imputed)
        predict_set_sc_scaled_imputed = pd.DataFrame(
            predict_set_sc_scaled_imputed)

        print('predict_set_sc_scaled_imputed', list(
            predict_set_sc_scaled_imputed.columns.values))

        label_pred = fit_model.predict(predict_set_sc_scaled_imputed)
        i = 0
        # pad with a placeholder so label_pred indices line up with the CSV
        # rows, whose first row is the header
        label_pred = ['y'] + label_pred.tolist()
        result = []

        with open(file_name, 'r') as csvinput:
            for row in csv.reader(csvinput):
                if i == 0:
                    result.append(COLUMNS)
                else:
                    row[-1] = 'no' if label_pred[i] == 0 else 'yes'
                    result.append(row)
                i += 1

        with open("output.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerows(result)

        out = None
        with open('output.csv', 'r') as f:
            reader = csv.DictReader(f)
            out = [row for row in reader]

        return model_name, out
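
    # `one_hot` is not defined in this snippet. A minimal sketch of what it
    # likely does, assuming it wraps pandas.get_dummies over the given
    # categorical columns (the name and signature come from the calls above;
    # the body is an assumption):
    def one_hot(frame, categorical_columns):
        # expand each categorical column into 0/1 indicator columns named
        # '<column>_<value>', dropping the original column
        return pd.get_dummies(frame, columns=categorical_columns)
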
    def validate(selected_columns):
        fit_model = joblib.load(model_file_name)

        validation_set = pd.read_csv(csvdata, names=COLUMNS,
                                     skipinitialspace=True, skiprows=1)
        validation_label_set = deepcopy(validation_set[LABEL])
        del validation_set[LABEL]

        # same preprocessing as predict(): drop, encode, impute, one-hot
        validation_set_imputed = deepcopy(validation_set)
        remove_list = list(set(COLUMNS) - set(selected_columns))
        validation_set_imputed = validation_set_imputed.drop(
            remove_list + ['marital', 'job', 'contact'], axis=1)
        validation_set_imputed['education'] = ternary_vectorizing(
            validation_set_imputed['education'],
            ['primary', 'secondary', 'tertiary'])
        validation_set_imputed['education'] = validation_set_imputed[
            'education'].replace('unknown', np.nan)
        validation_set_imputed.fillna(validation_set_imputed.mean(),
                                      inplace=True)
        validation_set_imputed = one_hot(validation_set_imputed,
                                         list(set(CATEGORICAL_COLUMNS_2).intersection(
                                             set(selected_columns))))
        # encode the 'no'/'yes' labels as 0/1
        validation_label_set = binary_vectorizing(validation_label_set,
                                                  ['no', 'yes'])
        # NOTE: as in predict(), refitting the scaler on validation data
        # assumes it matches the training distribution
        validation_set_sc_scaled_imputed = standard_scaler.fit_transform(
            validation_set_imputed)
        validation_set_sc_scaled_imputed = pd.DataFrame(
            validation_set_sc_scaled_imputed)

        print('validation_set_sc_scaled_imputed', list(
            validation_set_sc_scaled_imputed.columns.values))

        label_pred = fit_model.predict(validation_set_sc_scaled_imputed)

        cnf_matrix = confusion_matrix(validation_label_set, label_pred)
        np.set_printoptions(precision=2)
        print(model_name, cnf_matrix)

        # for binary labels, sklearn's confusion_matrix().ravel() returns
        # tn, fp, fn, tp in that order
        tn, fp, fn, tp = cnf_matrix.ravel()
        validate_result = {'TrueNegative': tn, 'FalsePositive': fp,
                           'FalseNegative': fn, 'TruePositive': tp}

        return model_name, validate_result
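
    # `ternary_vectorizing` and `binary_vectorizing` are also not defined
    # here. Plausible sketches, assuming they map category labels to their
    # index in the supplied list and leave unmatched values (e.g. 'unknown')
    # untouched so the callers above can NaN-replace and impute them:
    def binary_vectorizing(series, ordered_labels):
        # 'no' -> 0, 'yes' -> 1 for the label column
        return series.map({label: i for i, label in enumerate(ordered_labels)})

    def ternary_vectorizing(series, ordered_labels):
        # 'primary' -> 0, 'secondary' -> 1, 'tertiary' -> 2; anything else
        # (such as 'unknown') passes through unchanged
        mapping = {label: i for i, label in enumerate(ordered_labels)}
        return series.map(lambda value: mapping.get(value, value))
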
Example #3
training_set = pd.read_csv(BANK_TRAINING,
                           names=COLUMNS,
                           skipinitialspace=True,
                           skiprows=1)
training_label_set = deepcopy(training_set[LABEL])
del training_set[LABEL]

# impute and encode the training features
training_set_imputed = deepcopy(training_set)
training_set_imputed = training_set_imputed.drop(['marital', 'job', 'contact'],
                                                 axis=1)
training_set_imputed['education'] = ternary_vectorizing(
    training_set_imputed['education'], ['primary', 'secondary', 'tertiary'])
training_set_imputed['education'] = training_set_imputed[
    'education'].replace('unknown', np.nan)
training_set_imputed.fillna(training_set_imputed.mean(), inplace=True)
training_set_imputed = one_hot(training_set_imputed, CATEGORICAL_COLUMNS_2)
# encode the 'no'/'yes' labels as 0/1
training_label_set = binary_vectorizing(training_label_set, ['no', 'yes'])
# standardize the features
training_set_sc_scaled_imputed = standard_scaler.fit_transform(
    training_set_imputed)
training_set_sc_scaled_imputed = pd.DataFrame(training_set_sc_scaled_imputed)

##########################
validation_set = pd.read_csv(BANK_TESTING,
                             names=COLUMNS,
                             skipinitialspace=True,
                             skiprows=1)
validation_label_set = deepcopy(validation_set[LABEL])
del validation_set[LABEL]
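
# The snippets above and below assume module-level setup roughly like the
# following (names taken from the code; the concrete values are assumptions
# based on the UCI bank-marketing dataset these column names suggest).
# csvdata, file_name, username, model_name and model_file_name appear to be
# bound in an enclosing scope that wraps train/validate/predict:
#
#     import csv, json, os
#     from copy import deepcopy
#     import joblib                        # or sklearn.externals.joblib in
#                                          # older scikit-learn
#     import numpy as np
#     import pandas as pd
#     from sklearn.ensemble import RandomForestClassifier
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.metrics import confusion_matrix
#     from sklearn.preprocessing import StandardScaler
#
#     COLUMNS = [...]                      # bank-marketing column names
#     LABEL = 'y'
#     CATEGORICAL_COLUMNS_2 = [...]        # categorical columns to one-hot
#     standard_scaler = StandardScaler()
#     MODELS = {'rf': RandomForestClassifier(), 'lm': LogisticRegression()}
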
    def train(selected_columns):
        training_set = pd.read_csv(csvdata, names=COLUMNS,
                                   skipinitialspace=True, skiprows=1)
        training_label_set = deepcopy(training_set[LABEL])
        del training_set[LABEL]

        training_set_imputed = deepcopy(training_set)
        remove_list = list(set(COLUMNS) - set(selected_columns))
        training_set_imputed = training_set_imputed.drop(remove_list + ['marital', 'job', 'contact'], axis=1)
        training_set_imputed['education'] = ternary_vectorizing(
            training_set_imputed['education'],
            ['primary', 'secondary', 'tertiary'])
        training_set_imputed['education'] = training_set_imputed[
            'education'].replace('unknown', np.nan)
        training_set_imputed.fillna(training_set_imputed.mean(), inplace=True)
        training_set_imputed = one_hot(training_set_imputed,
                                       list(set(CATEGORICAL_COLUMNS_2).intersection(set(selected_columns))))
        # encode the 'no'/'yes' labels as 0/1
        training_label_set = binary_vectorizing(training_label_set,
                                                ['no', 'yes'])
        # standardize the features
        training_set_sc_scaled_imputed = standard_scaler.fit_transform(
            training_set_imputed)
        training_set_sc_scaled_imputed = pd.DataFrame(
            training_set_sc_scaled_imputed)

        fit_model = MODELS[model_name].fit(training_set_sc_scaled_imputed,
                                           training_label_set)
        # collect per-feature importances: random forests expose
        # feature_importances_, linear models expose coef_
        importances = []
        feature_importance_output = ""
        if model_name == "rf":
            importances = fit_model.feature_importances_
            feature_importance_output = username + "_rf.json"
        elif model_name == "lm":
            importances = fit_model.coef_[0]
            feature_importance_output = username + "_lm.json"

        if len(importances):
            feature_names = training_set_imputed.keys()
            indices = np.argsort(importances)[::-1]
            df = pd.DataFrame(columns=['features', 'importance'])
            for f in range(training_set_sc_scaled_imputed.shape[1]):
                df.loc[f] = [feature_names[indices[f]], importances[indices[f]]]
            # sum one-hot column importances back into their source feature
            # (e.g. 'education_primary' -> 'education')
            f_dict = {}
            for index, row in df.iterrows():
                key = row['features'].split('_', 1)[0]
                print(key, row['importance'])
                if key in f_dict:
                    f_dict[key] += row['importance']
                else:
                    f_dict[key] = row['importance']
            print(f_dict)
            if feature_importance_output:
                with open(feature_importance_output, 'w') as f2:
                    json.dump(f_dict, f2)

        joblib.dump(fit_model, model_file_name)

        print(model_name, model_file_name)
        if not os.path.exists('existingmodel.json'):
            open('existingmodel.json', 'a').close()

        # read the registry of trained models, tolerating an empty or
        # corrupt file
        jd = {}
        with open('existingmodel.json', 'r') as f:
            try:
                jd = json.load(f)
            except ValueError:
                pass

        # record which columns this model was trained on
        with open('existingmodel.json', 'w') as f1:
            jd[model_file_name] = selected_columns
            json.dump(jd, f1)

        if not feature_importance_output:
            return model_name, fit_model.get_params()
        else:
            return model_name, fit_model.get_params(), feature_importance_output
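
    # A sketch of how these closures might be driven, assuming they live
    # inside a handler that binds csvdata, file_name, username, model_name
    # and model_file_name (the call sites here are hypothetical):
    #
    #     name, params, importance_file = train(selected_columns)
    #     name, counts = validate(selected_columns)  # confusion-matrix counts
    #     name, rows = predict(selected_columns)     # rows written to output.csv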