Example #1
 def run(self, default_column='default payment next month'):
     woe_calc = WOE()
     uuid_t = uuid.uuid4().hex
     cr = CreditRisk(self.url, uuid_t)
     filename = cr.download_file()
     df, classes = cr.read_file(filename, uuid_t)
     X, y = cr.test_train_matrix(df)
     X_train, X_test, y_train, y_test = cr.feature_select(
         X, y, self.test_size, False)
     dtrain = xgb.DMatrix(X_train, label=y_train)
     dtest = xgb.DMatrix(X_test, label=y_test)
     num_round = 5
     evallist = [(dtest, 'eval'), (dtrain, 'train')]
     param = {
         'objective': 'binary:logistic',
         'silent': 1,
         'eval_metric': ['error', 'logloss']
     }
     bst = xgb.train(param, dtrain, num_round, evallist)
     uuid_t = cr.save_model(bst)
     y_train_pred = bst.predict(dtrain)
     y_test_pred = bst.predict(dtest)
     list_attributes = list(filename)
     if default_column is not None:
         default_col_index = list_attributes.index(default_column)
     c_nparray = filename.as_matrix()
     df_o = filename
     score = []
     woe_dict = {}
     list_attributes.pop()
     for l in list_attributes:
         woe_dict[l] = woe_calc.woe_single_x_score(
             c_nparray[:, list_attributes.index(l)],
             c_nparray[:, default_col_index].astype(bool))
     for index, row in df_o.iterrows():
         woe_val = 0
         # As the score points are just another way to denote the scorecard, they do not affect its predictive power.
         # Assume that 600 score points correspond to odds (bads to goods) of 1:50,
         # while each additional 20 points doubles the odds (620 points -> 1:100, 640 points -> 1:200, etc.).
         startScore = 600
         pdo = 20
         factor = pdo / math.log(2)
         offset = round(startScore - (factor * math.log(50)))
         for l in list_attributes:
             cell_value = df_o.at[index, l]
             col_index = list_attributes.index(l)
             woe_val = woe_val + woe_dict[l][cell_value]
         score.append(round(offset + woe_val))
         print('Rows:: ', index, 'Score:: ', score[index])
     return cr.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                      classes, uuid_t)
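The scaling constants inside the loop above follow directly from the two assumptions stated in the comments: a score of 600 corresponds to good:bad odds of 50:1, and every additional 20 points (PDO) doubles the odds. A quick standalone check of the resulting factor and offset (a sketch, not part of the example):

import math

pdo = 20           # points to double the odds
start_score = 600  # score anchored at odds of 50:1

factor = pdo / math.log(2)                           # ~28.85 points per unit of log-odds
offset = round(start_score - factor * math.log(50))  # ~487

# score(odds) = offset + factor * ln(odds); doubling the odds adds exactly PDO points
assert round(offset + factor * math.log(50)) == 600
assert round(offset + factor * math.log(100)) == 620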
Example #2
    def run(self, default_column='default payment next month'):
        woe_calc = WOE()

        uuid_t = uuid.uuid4().hex
        cr = CreditRisk(self.url, uuid_t)
        filename = cr.download_file()
        df, classes = cr.read_file(filename, uuid_t)
        X, y = cr.test_train_matrix(df)
        X_train, X_test, y_train, y_test = cr.feature_select(
            X, y, self.test_size, False)
        rf = RandomForestClassifier(n_estimators=500, min_samples_leaf=5)
        rf.fit(X_train, y_train)
        uuid_t = cr.save_model(rf)
        y_train_pred = rf.predict_proba(X_train)[:, 1]
        y_test_pred = rf.predict_proba(X_test)[:, 1]

        list_attributes = list(filename)
        if default_column is not None:
            default_col_index = list_attributes.index(default_column)
        c_nparray = filename.as_matrix()
        df_o = filename
        score = []
        woe_dict = {}
        list_attributes.pop()
        for l in list_attributes:
            woe_dict[l] = woe_calc.woe_single_x_score(
                c_nparray[:, list_attributes.index(l)],
                c_nparray[:, default_col_index].astype(bool))
        for index, row in df_o.iterrows():
            woe_val = 0
            # As the score points are just another way to denote the scorecard, they do not affect its predictive power.
            # Assume that 600 score points correspond to odds (bads to goods) of 1:50,
            # while each additional 20 points doubles the odds (620 points -> 1:100, 640 points -> 1:200, etc.).
            startScore = 600
            pdo = 20
            factor = pdo / math.log(2)
            offset = round(startScore - (factor * math.log(50)))
            for l in list_attributes:
                cell_value = df_o.at[index, l]
                col_index = list_attributes.index(l)
                woe_val = woe_val + \
                    woe_dict[l][cell_value] * math.log(2) / (1 - math.log(2))
            score.append(round(offset + woe_val))
            print('Rows:: ', index, 'Score:: ', score[index])
        return cr.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                         classes, uuid_t)
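Both tree-based examples above obtain a probability of default from predict_proba but build the score from the summed WOE values. Under the same 600 / 1:50 / PDO=20 scaling, a predicted probability could also be mapped directly to points; a minimal sketch (prob_to_score is my own helper, not part of the CreditRisk class):

import math

def prob_to_score(p_default, start_score=600, pdo=20, base_odds=50):
    """Map a predicted probability of default to scorecard points.

    Assumes start_score corresponds to good:bad odds of base_odds:1 and that
    each pdo points double the odds, as stated in the comments above.
    """
    factor = pdo / math.log(2)
    offset = start_score - factor * math.log(base_odds)
    odds_good_to_bad = (1 - p_default) / p_default
    return round(offset + factor * math.log(odds_good_to_bad))

# prob_to_score(1 / 51) -> 600 (odds of exactly 50:1)
# prob_to_score(0.01)   -> 620 (odds of 99:1, roughly double)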
Example #3
    def run(self, default_column='default payment next month'):
        woe_calc = WOE()

        uuid_t = uuid.uuid4().hex
        cr = CreditRisk(self.url, uuid_t)
        df_o = cr.download_file()
        df, classes = cr.read_file(df_o, uuid_t)
        X, y = cr.test_train_matrix(df)
        X_train, X_test, y_train, y_test = cr.feature_select(
            X, y, self.test_size, False)
        regr = linear_model.LogisticRegression()
        regr.fit(X_train, y_train)
        uuid_t = cr.save_model(regr)
        y_train_pred = regr.predict_proba(X_train)[:, 1]
        y_test_pred = regr.predict_proba(X_test)[:, 1]
        list_attributes = list(df_o)
        # if default_column is not None:
        default_col_index = list_attributes.index(default_column)
        c_nparray = df_o.as_matrix()
        score = []
        woe_dict = {}
        list_attributes.pop()
        for l in list_attributes:
            woe_dict[l] = woe_calc.woe_single_x_score(
                c_nparray[:, list_attributes.index(l)],
                c_nparray[:, default_col_index].astype(bool))
        for index, row in df_o.iterrows():
            woe_val = 0
            # As the score points are just another way to denote the scorecard, they do not affect its predictive power.
            # Assume that 600 score points correspond to odds (bads to goods) of 1:50,
            # while each additional 20 points doubles the odds (620 points -> 1:100, 640 points -> 1:200, etc.).
            startScore = 600
            pdo = 20
            factor = pdo / math.log(2)
            offset = round(startScore - (factor * math.log(50)))
            for l in list_attributes:
                cell_value = df_o.at[index, l]
                col_index = list_attributes.index(l)
                woe_val = woe_val + (
                    woe_dict[l][cell_value] * regr.coef_[0][col_index]) + (
                        regr.intercept_[0] / len(list_attributes))
            score.append(round(offset + woe_val))
        return cr.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                         classes, uuid_t)
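This logistic regression variant weights each WOE value by its model coefficient and spreads the intercept evenly across the features, but leaves the per-feature contribution unscaled. The textbook scorecard allocation (e.g. Siddiqi's) also multiplies each contribution by factor and splits offset across the n characteristics; a hedged sketch of that variant follows (the sign flip assumes WOE = ln(%good/%bad) and a model where 1 means default; drop the minus sign under the opposite convention):

import math

def attribute_points(woe, coef, intercept, n_features,
                     start_score=600, pdo=20, base_odds=50):
    """Points contributed by one attribute; summing over all features gives the total score."""
    factor = pdo / math.log(2)
    offset = start_score - factor * math.log(base_odds)
    return -(woe * coef + intercept / n_features) * factor + offset / n_features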
Example #4
    def run(self, type=None, columnName=None, default_column=None):
        cr = CreditRisk(self.url, "")
        df_train = cr.download_file()
        #df_train = pd.read_excel(filename, header=1)
        list_attributes = list(df_train)
        if default_column is not None:
            default_col_index = list_attributes.index(default_column)
        if type is None:
            cjson_list = []
            njson_list = []
            woe_calc = WOE()
            for l in list_attributes:
                x_vals = []
                y_vals = []
                x_hist = []
                y_hist = []
                if l in self.category_list:
                    df_train[l] = df_train[l].astype('category')
                    d = df_train.groupby([l], as_index=False).size()
                    d_frame = d.to_frame()

                    c_nparray = df_train.as_matrix()

                    for index, row in d_frame.iterrows():
                        x_vals.append(index)
                        y_vals.append(row[0])
                        if len(x_vals) > 200:
                            sample_list = sorted(random.sample(
                                range(0, len(x_vals)), 200))
                        else:
                            sample_list = range(0, len(x_vals))
                    categorical_json = {'feature': l,
                                        'x_vals': [str(x_vals[i]) for i in sample_list],
                                        'y_vals': [str(y_vals[i]) for i in sample_list],
                                        'woe': woe_calc.woe_single_x(c_nparray[:, list_attributes.index(l)], c_nparray[:, default_col_index].astype(bool))
                                        }
                    cjson_list.append(categorical_json)
                elif l in self.numeric_list:
                    df_train[l] = pd.to_numeric(df_train[l], errors='coerce')
                    # 'normed' is deprecated in NumPy and conflicts with density=False; raw counts are wanted here
                    h = np.histogram(df_train[l], bins='auto', density=False)
                    for x in h[1].tolist():
                        x_hist.append(format(float(x), '.2f'))
                    for y in h[0].tolist():
                        y_hist.append(format(float(y), '.0f'))
                    min_val = str(df_train[l].dropna().min())
                    max_val = str(df_train[l].dropna().max())
                    mean_val = str(df_train[l].dropna().mean())
                    median_val = str(df_train[l].dropna().median())
                    mode_val = str(df_train[l].dropna().mode()[0])
                    tot_null = str(df_train[l].isnull().sum())

                    numerical_json = {'feature': l,
                                      'x_hist': x_hist,
                                      'y_hist': y_hist,
                                      'min_val': min_val,
                                      'max_val': max_val,
                                      'mean_val': mean_val,
                                      'median_val': median_val,
                                      'mode_val': mode_val,
                                      'tot_null': tot_null
                                      }
                    njson_list.append(numerical_json)
            json_final = {'categorical': cjson_list, 'numerical': njson_list}
            return json.dumps(json_final)
        else:
            l = columnName
            if type == 0:
                x_vals = []
                y_vals = []
                woe_calc = WOE()
                df_train[l] = df_train[l].astype('category')
                d = df_train.groupby([l], as_index=False).size()
                d_frame = d.to_frame()
                c_nparray = df_train.as_matrix()
                for index, row in d_frame.iterrows():
                    x_vals.append(index)
                    y_vals.append(row[0])
                    if len(x_vals) > 200:
                        sample_list = sorted(random.sample(
                            range(0, len(x_vals)), 200))
                    else:
                        sample_list = range(0, len(x_vals))
                return json.dumps({'feature': l,
                                       'x_vals': [str(x_vals[i]) for i in sample_list],
                                       'y_vals': [str(y_vals[i]) for i in sample_list],
                                       'woe': woe_calc.woe_single_x(c_nparray[:, list_attributes.index(l)], c_nparray[:, 24].astype(bool))
                                       })
            else:
                x_hist = []
                y_hist = []
              #  df_train[l] = pd.to_numeric(df_train[l], errors='coerce')
              #  h = np.histogram(df_train[l], bins='auto', normed = True, density=False)
              #  for x in h[1].tolist():
              #       x_hist.append(format(float(x), '.2f'))
              #  for y in h[0].tolist():
              #       y_hist.append(format(float(y), '.0f'))
                min_val = str(df_train[l].dropna().min())
                max_val = str(df_train[l].dropna().max())
                mean_val = str(df_train[l].dropna().mean())
                median_val = str(df_train[l].dropna().median())
                mode_val = str(df_train[l].dropna().mode()[0])
                tot_null = str(df_train[l].isnull().sum())
                return json.dumps({'feature': l,
                                                  'x_hist':df_train[l].values.tolist(),
                                                  'y_hist': y_hist,
                                                  'min_val': min_val,
                                                  'max_val': max_val,
                                                  'mean_val': mean_val,
                                                  'median_val': median_val,
                                                  'mode_val': mode_val,
                                                  'tot_null': tot_null
                                                  })
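The WOE helper class itself is not shown in any of these examples. For a single categorical feature, a minimal stand-in for woe_single_x might look like the sketch below (my own illustration of the usual definition, not the project's implementation; the helper used here may apply a different sign convention or smoothing):

import numpy as np

def woe_single_x(x, y_bad):
    """Return ({category: WOE}, IV) for one categorical feature.

    x     : 1-D array of category labels
    y_bad : boolean array, True where the target event (default) occurred
    WOE is taken as ln(%good / %bad) per category; IV sums (%good - %bad) * WOE.
    """
    x = np.asarray(x)
    y_bad = np.asarray(y_bad, dtype=bool)
    total_bad = max(y_bad.sum(), 1)
    total_good = max((~y_bad).sum(), 1)
    woe_dict, iv = {}, 0.0
    for cat in np.unique(x):
        mask = x == cat
        # small floor keeps log() finite for categories with no goods or no bads
        pct_bad = max(y_bad[mask].sum(), 0.5) / total_bad
        pct_good = max((~y_bad)[mask].sum(), 0.5) / total_good
        woe = float(np.log(pct_good / pct_bad))
        woe_dict[cat] = woe
        iv += (pct_good - pct_bad) * woe
    return woe_dict, iv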
Example #5
    def run(self,
            col_name,
            feature_type=-1,
            default_col="DEFAULT PAYMENT NEXT MONTH"):
        filename = self.url.rsplit('/', 1)[-1]
        uuid = os.path.splitext(os.path.basename(filename))[0]
        #print("UV.url:", self.url)
        #print("UV.uuid:", uuid)
        CR = CreditRisk(self.url, uuid)
        Processing = FeatureProcessing()

        cd = os.path.dirname(os.path.abspath(__file__))
        filename = cd + "/user_aie_datasets/" + uuid + "-aie"
        filepath = Path(filename)
        if filepath.is_file():
            df_user = pd.read_csv(filename, encoding='utf-8')
        else:
            df_user = CR.download_file()
            filename = cd + "/user_datasets/" + uuid
            cd = os.path.dirname(os.path.abspath(__file__))
            df_user.to_csv(filename, encoding='utf-8', index=False)

        df_user = Processing.sort_df_by_feature_names(df_user)
        df, classes = CR.write_file(df_user, default_col, uuid)
        #df_binned = Processing.bin_numerical_features(df) # uncomment to get df where numerical features are binned

        df.columns = df.columns.str.upper()
        features = list(df.columns.values)
        #print(features)
        num_features = len(features) - 1

        if default_col is not None:
            default_col_index = features.index(default_col)

        if feature_type == -1:
            features_dict = Processing.categorize_features_numerical(df)
            feature_type = features_dict.get(col_name, 0)

        #print(feature_type)
        if feature_type == 0:
            x_vals = []
            y_vals = []
            woe = WOE()
            df[col_name] = df[col_name].astype('category')
            df_grouped = df.groupby([col_name], as_index=False).size()
            df_ = df_grouped.to_frame()
            category_nparray = df.as_matrix()
            for idx, row in df_.iterrows():
                x_vals.append(idx)
                y_vals.append(row[0])
                if len(x_vals) > 200:
                    sample_list = sorted(
                        random.sample(range(0, len(x_vals)), 200))
                else:
                    sample_list = range(0, len(x_vals))
            return json.dumps({
                'feature':
                col_name,
                'x_vals': [str(x_vals[i]) for i in sample_list],
                'y_vals': [str(y_vals[i]) for i in sample_list],
                #'woe': woe.woe_single_x(category_nparray[:, features.index(col_name)],
                #                    category_nparray[:, num_features].astype(bool))
                'woe':
                woe.woe_single_x_score(
                    category_nparray[:, features.index(col_name)],
                    category_nparray[:, features.index(default_col)].astype(
                        bool)),
                'feature_type':
                feature_type
            })
        elif feature_type == 1:
            woe = WOE()
            category_nparray = df.as_matrix()
            x_hist = []
            y_hist = []
            min_val = str(df[col_name].dropna().min())
            max_val = str(df[col_name].dropna().max())
            mean_val = str(df[col_name].dropna().mean())
            median_val = str(df[col_name].dropna().median())
            mode_val = str(df[col_name].dropna().mode()[0])
            tot_null = str(df[col_name].isnull().sum())
            std_val = str(df[col_name].dropna().std())
            var_val = str(df[col_name].dropna().var())
            high_corrs = str(Processing.get_corr_coeffs(df, col_name))
            count_outliers = str(Processing.get_outliers_count(df, col_name))
            percentage_missing = str(
                Processing.get_missing_percent(df, col_name))
            #_, iv = woe.woe_single_x(category_nparray[:, features.index(col_name)], category_nparray[:, num_features].astype(bool))
            _, iv = woe.woe_single_continuous_feature(
                df[[col_name, default_col]], col_name, default_col)
            #_, iv = woe.woe_single_x(category_nparray[:, features.index(col_name)], category_nparray[:, features.index(default_col)].astype(bool))
            #print(col_name, " numerical iv:", iv)
            return json.dumps({
                'feature': col_name,
                'x_hist': df[col_name].values.tolist(),
                'y_hist': y_hist,
                'min_val': min_val,
                'max_val': max_val,
                'mean_val': mean_val,
                'median_val': median_val,
                'mode_val': mode_val,
                'tot_null': tot_null,
                'std_val': std_val,
                'var_val': var_val,
                'high_corrs': high_corrs,
                'count_outliers': count_outliers,
                'percentage_missing': percentage_missing,
                'iv': iv,
                'feature_type': feature_type
            })
Example #6
    def run(self,
            default_column='DEFAULT PAYMENT NEXT MONTH',
            corr_threshold=1.0):
        # download file from url
        filename = self.url.rsplit('/', 1)[-1]
        uuid = os.path.splitext(os.path.basename(filename))[0]
        print("RandomForest.self.url:", self.url)
        CR = CreditRisk(self.url, uuid)
        Processing = FeatureProcessing()

        # check if file exists locally, if not download
        df = CR.read_file()

        df = Processing.sort_df_by_feature_names(df)

        df, classes = CR.write_file(df, default_column, uuid)

        # drop ignored features
        if self.ignore_features is not None:
            for feature in self.ignore_features:
                df = df.drop(feature, axis=1)
        #df = CR.bin_numerical_features(df) # uncomment if needed

        # remove highly correlated features
        features_to_drop = Processing.list_highly_corr_features(
            df, corr_threshold)
        #print(features_to_drop)
        df = Processing.remove_features(df, features_to_drop)

        # partition
        features = list(df)
        #print(features)
        X, y = CR.test_train_matrix(df, default_column)
        X_train, X_test, y_train, y_test = CR.split_train_test(
            X, y, self.test_size, False, len(features))

        # create, fit model
        # TODO need to do grid search to find optimal hyperparams
        clf = RandomForestClassifier(n_estimators=1000, min_samples_leaf=2)
        clf.fit(X_train, y_train)
        print("random forest accuracy: " +
              str(CR.get_model_accuracy(clf, X_test, y_test)))

        #print("log loss: ", self.log_loss(clf, X_train, y_train))

        # TODO may not need this: get list of most important features
        '''importance = []
        for feature in zip(features, clf.feature_importances_):
            importance.append(feature)
        importance = sorted(importance, key=itemgetter(1), reverse=True) # descending order
        print(importance)'''

        # save model
        #uuid_t = uuid.uuid4().hex
        uuid_t = CR.save_model(clf)

        # get probability estimates
        y_train_pred = clf.predict_proba(X_train)[:, 1]
        y_test_pred = clf.predict_proba(X_test)[:, 1]

        # get target col index
        if default_column is not None:
            default_col_index = features.index(default_column)

        # uncomment to see kfolds score
        #kfolds_cv_score = self.kfold_cv(CR, X.as_matrix(), y.as_matrix())
        #print(kfolds_cv_score)

        c_nparray = df.as_matrix()
        score = []
        woe_dict = {}
        features.pop()

        # TODO make credit scores from probability of default
        woe = WOE()
        for l in features:
            woe_dict[l] = woe.woe_single_x_score(
                c_nparray[:, features.index(l)],
                c_nparray[:, default_col_index].astype(bool))
        for index, row in df.iterrows():
            woe_val = 0
            # As the score points are just another way to denote the scorecard, they do not affect its predictive power.
            # Assume that 600 score points correspond to odds (bads to goods) of 1:50,
            # while each additional 20 points doubles the odds (620 points -> 1:100, 640 points -> 1:200, etc.).
            startScore = 600
            pdo = 20
            factor = pdo / math.log(2)
            offset = round(startScore - (factor * math.log(50)))
            for l in features:
                cell_value = df.at[index, l]
                col_index = features.index(l)
                woe_val = woe_val + \
                    woe_dict[l][cell_value] * math.log(2) / (1 - math.log(2))
            score.append(round(offset + woe_val))
            #print('Rows:: ', index, 'Score:: ', score[index])

        return CR.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                         classes, uuid_t)
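The TODO above notes that the RandomForest hyperparameters (n_estimators=1000, min_samples_leaf=2) were chosen without tuning. A minimal grid search sketch with scikit-learn, with purely illustrative parameter ranges:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [200, 500, 1000],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2'],
}
search = GridSearchCV(RandomForestClassifier(), param_grid,
                      scoring='roc_auc', cv=5, n_jobs=-1)
# search.fit(X_train, y_train)   # X_train / y_train as produced above
# clf = search.best_estimator_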
Example #7
    def fit(self, X, y=None):
        """
        1. Discretize each feature into nbin intervals based on entropy
        2. Compute each feature's WOE and IV value
        3. Save the transformed discretized features and WOE features
        4. Save each feature's IV value and discretization intervals
        :param X: (N,M)
        :param y: (N,1)
        :return:
        """
        # dataset = pd.read_table(self.fn_raw_train, sep=',', header=0)
        # labels = dataset.pop(self.labelname)
        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)
        user_info = X.loc[:, self.sample_columns]
        logging('user_info:{}'.format(user_info.columns))
        dataset = X.drop(user_info, axis=1).fillna(0).astype(np.int32)
        labels = y
        logging('Begin to discretize features', dataset.shape)
        start = time.clock()

        woe = WOE()
        woe_arr = []
        to_drop = []
        for column in dataset.columns:
            desc = dataset[column].describe(percentiles=[0.98])
            minv, maxv = max(-1, desc['min']), round(desc['98%'])
            features = dataset[column].clip(minv, maxv)
            n_uniques = features.nunique()
            if n_uniques < 2 or desc['std'] < 0.05:
                to_drop.append(column)
            else:
                if n_uniques < 1000:
                    feature_values = features.apply(lambda x: int(x))
                    seg_ents_keys_sorted = self._segment(feature_values, labels)
                else:
                    # Log-transform and discretize: stretch the low-frequency range, compress the high-frequency range;
                    # the coefficient provides compensation, a saturating response, and smoothing of oscillations in the low range.
                    feature_values = features.apply(lambda x: int(log(x - minv + 0.1 ** 8, 1.01)))
                    seg_ents_keys_sorted = [
                        (round(1.01 ** seg_ents_keys_sorted[0]) + minv, round(1.01 ** seg_ents_keys_sorted[1]) + minv)
                        for seg_ents_keys_sorted in self._segment(feature_values, labels)]
                seg_index = features.apply(categorizing, args=(seg_ents_keys_sorted,))
                woe_dict, iv = woe.woe_single_x(seg_index, labels)
                logging(
                    '{}({}, {}), iv: {}, intervals: {}'.format(column, minv, maxv, round(iv, 4), seg_ents_keys_sorted))
                assert len(seg_ents_keys_sorted) == len(woe_dict), '{} ---- {}'.format(seg_ents_keys_sorted, woe_dict)
                if iv <= 0.02:
                    to_drop.append(column)
                else:
                    woe_arr.append(woe_dict)
                    self.iv_dict[column], self.woes_dict[column] = iv, woe_dict
                    dataset.loc[:, column], self.intervals_dict[column] = seg_index, seg_ents_keys_sorted
        if to_drop:
            dataset.drop(to_drop, axis=1, inplace=True)

        logging('End to discretize features', dataset.shape)

        self.selected_columns = dataset.columns

        # process the discretized (dis) features
        # temp_dataset = woe.woe_replace(dataset, np.array(woe_arr))
        # woe_dataset = pd.DataFrame(X, columns=self.selected_columns)
        # store the discretized (dis) features
        dis_dataset = pd.concat([user_info, dataset], axis=1)
        dis_dataset.insert(dis_dataset.shape[1], labels.name, labels)
        dis_dataset.to_csv(self.fn_dis_train, index=False)
        self._create_dis_hql(self.tablename)
        logging('End to store dis features', dis_dataset.shape)
        # process the WOE features
        temp_dataset = woe.woe_replace(dataset, np.array(woe_arr))
        woe_dataset = pd.DataFrame(temp_dataset, columns=self.selected_columns)
        # store the WOE features
        woe_dataset = pd.concat([user_info, woe_dataset], axis=1)
        woe_dataset.insert(woe_dataset.shape[1], labels.name, labels)
        woe_dataset.to_csv(self.fn_woe_train, index=False)
        self._create_woe_hql(self.tablename)
        logging('End to store woe features', woe_dataset.shape)

        cPickle.dump(self.woes_dict, open(self.fn_woes_dict, 'wb'))
        cPickle.dump(self.intervals_dict, open(self.fn_intervals_dict, 'wb'))
        with open(self.fn_ivs_dict, 'w') as fp:
            json.dump(self.iv_dict, fp, encoding='utf-8')

        if self.feature_selection is True:
            self.dis_rm_columns = pearson_ccs(dis_dataset, self.iv_dict, ratio=self.ratio)
            dis_rm_dataset = dis_dataset.drop(self.dis_rm_columns, axis=1)
            # dis_rm_dataset.insert(dis_rm_dataset.shape[1], labels.name, labels)
            # dis_rm_dataset = pd.concat([user_info, dis_rm_dataset], axis=1)
            dis_rm_dataset.to_csv(self.fn_dis_rm_train, index=False)
            logging('dis feature after feature selection', dis_rm_dataset.shape)

            self.woe_rm_columns = pearson_ccs(woe_dataset, self.iv_dict, self.ratio)
            woe_rm_dataset = woe_dataset.drop(self.woe_rm_columns, axis=1)
            # woe_rm_dataset.insert(woe_rm_dataset.shape[1], labels.name, labels)
            # woe_rm_dataset = pd.concat([user_info, woe_rm_dataset], axis=1)
            woe_rm_dataset.to_csv(self.fn_woe_rm_train, index=False)
            logging('woe feature after feature selection', woe_rm_dataset.shape)

        logging('Discretizing features finished.', dataset.shape, 'Time elapsed: %.2f s' % (time.clock() - start))
        return self
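The iv <= 0.02 cut-off above matches the usual rule of thumb for Information Value (below 0.02: not predictive; 0.02-0.1: weak; 0.1-0.3: medium; 0.3 and above: strong). The woe_replace helper called near the end is also not shown in the listing; a minimal stand-in consistent with how it is used here (my own sketch, not necessarily the project's implementation):

import numpy as np
import pandas as pd

def woe_replace(dataset, woe_arr):
    """Replace each discretized column with its WOE values.

    dataset : DataFrame of bin indices, one column per retained feature
    woe_arr : sequence of {bin_index: woe} dicts, in column order
    """
    out = dataset.copy()
    for col, woe_dict in zip(out.columns, woe_arr):
        out[col] = out[col].map(woe_dict)
    return out.to_numpy()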
Example #8
    def run(self,
            default_column='DEFAULT PAYMENT NEXT MONTH',
            corr_threshold=1.0):
        # download file from url
        filename = self.url.rsplit('/', 1)[-1]
        uuid = os.path.splitext(os.path.basename(filename))[0]
        print("LogisticRegression.self.url:", self.url)
        CR = CreditRisk(self.url, uuid)
        Processing = FeatureProcessing()

        # check if file exists locally, if not download
        df = CR.read_file()

        df = Processing.sort_df_by_feature_names(df)

        df, classes = CR.write_file(df, default_column, uuid)

        # drop ignored features
        if self.ignore_features is not None:
            for feature in self.ignore_features:
                df = df.drop(feature, axis=1)
        #df = CR.bin_numerical_features(df)  # uncomment if needed

        # remove highly correlated features
        features_to_drop = Processing.list_highly_corr_features(
            df, corr_threshold)
        #print(features_to_drop)
        df = Processing.remove_features(df, features_to_drop)

        # partition
        features = list(df)
        #print(features)
        X, y = CR.test_train_matrix(df, default_column)
        X_train, X_test, y_train, y_test = CR.split_train_test(
            X, y, self.test_size, False, len(features))

        # create, fit model
        clf = linear_model.LogisticRegression()
        clf.fit(X_train, y_train)

        # save model
        #uuid_t = uuid.uuid4().hex
        uuid_t = CR.save_model(clf)

        # get probability estimates
        y_train_pred = clf.predict_proba(X_train)[:, 1]
        y_test_pred = clf.predict_proba(X_test)[:, 1]

        # get target col index
        if default_column is not None:
            default_col_index = features.index(default_column)

        c_nparray = df.as_matrix()
        score = []
        woe_dict = {}
        features.pop()

        woe = WOE()
        for l in features:
            woe_dict[l] = woe.woe_single_x_score(
                c_nparray[:, features.index(l)],
                c_nparray[:, default_col_index].astype(bool))
        for index, row in df.iterrows():
            woe_val = 0
            # As the score points are just another way to denote the scorecard, they do not affect its predictive power.
            # Assume that 600 score points correspond to odds (bads to goods) of 1:50,
            # while each additional 20 points doubles the odds (620 points -> 1:100, 640 points -> 1:200, etc.).
            startScore = 600
            pdo = 20
            factor = pdo / math.log(2)
            offset = round(startScore - (factor * math.log(50)))
            for l in features:
                cell_value = df.at[index, l]
                col_index = features.index(l)
                #print(l, ":", col_index)
                woe_val = woe_val + (woe_dict[l][cell_value] *
                                     clf.coef_[0][col_index]) + (
                                         clf.intercept_[0] / len(features))
            score.append(round(offset + woe_val))

        print("logistic regression accuracy: " +
              str(CR.get_model_accuracy(clf, X_test, y_test)))
        return CR.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                         classes, uuid_t)