def check_stratification(data, k_fold_files, hold_out_files):
    """Print the is_bad class balance in the original data, the hold-out split
    and every k-fold split file."""
    print('Ratio of is_bad "0" to "1" in the original file:')
    original_count = data['is_bad'].value_counts()
    print('\t0: {0:d}'.format(original_count['0']))
    print('\t1: {0:d}'.format(original_count['1']))
    print('\tratio: {0:.3f}'.format(original_count['0'] * 1.0 /
                                    original_count['1']))
    print('Ratio of is_bad "0" to "1" in the hold-out train file:')
    data_train = read_Nstr_from_Csv(hold_out_files['train_files'][0])
    train_count = data_train['is_bad'].value_counts()
    print('\t0: {0:d}'.format(train_count['0']))
    print('\t1: {0:d}'.format(train_count['1']))
    print('\tratio: {0:.3f}'.format(train_count['0'] * 1.0 / train_count['1']))
    print('Ratio of is_bad "0" to "1" in the hold-out validation file:')
    data_validation = read_Nstr_from_Csv(hold_out_files['validation_files'][0])
    validation_count = data_validation['is_bad'].value_counts()
    print('\t0: {0:d}'.format(validation_count['0']))
    print('\t1: {0:d}'.format(validation_count['1']))
    print('\tratio: {0:.3f}'.format(validation_count['0'] * 1.0 /
                                    validation_count['1']))

    print('Ratio of is_bad "0" to "1" in the k-fold files:')
    for name in (k_fold_files['train_files'] +
                 k_fold_files['validation_files']):
        data_current = read_Nstr_from_Csv(name)
        current_count = data_current['is_bad'].value_counts()
        print('\tname: {0:s}'.format(name))
        print('\t0: {0:d}'.format(current_count['0']))
        print('\t1: {0:d}'.format(current_count['1']))
        print('\tratio: {0:.3f}'.format(current_count['0'] * 1.0 /
                                        current_count['1']))
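
# Usage sketch (not part of the original file; paths and CONFIG keys are assumed
# to match the __main__ block below): build the splits first, then verify that
# every split preserves the is_bad class ratio.
#
#   data = read_Nstr_from_Csv('../data/preprocessed_dataset.csv')
#   k_fold_files = split_for_validation(data, CONFIG['target_column'], 'k-fold',
#                                       CONFIG['k_fold_k'],
#                                       CONFIG['hold_out_train_size'])
#   hold_out_files = split_for_validation(data, CONFIG['target_column'], 'random',
#                                         CONFIG['k_fold_k'],
#                                         CONFIG['hold_out_train_size'])
#   check_stratification(data, k_fold_files, hold_out_files)
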
def generate_settings(data_dict_path, train_path):
    """Build the variable-assignment settings (column names, types, unique
    values and selected columns) from the data dictionary and a preprocessed
    training file."""
    preprocessed_data = read_Nstr_from_Csv(train_path)
    selected_columns = []
    variable_assignment = {
        "target": CONFIG['target_column'],
        "description": []
    }
    data_dict = read_Nstr_from_Csv(data_dict_path)
    processed = []
    for i in range(data_dict.shape[0]):
        current_variable = data_dict.at[i, 'Column Name']
        if current_variable not in CONFIG['remove_columns']:
            current_variable = current_variable.lower().strip().replace(
                '-', '_')
            counts = preprocessed_data[current_variable].value_counts()
            unique_values = list(counts.keys())
            if '' in unique_values:
                unique_values.remove('')
            if len(unique_values) >= 2:
                selected_columns.append(current_variable)
            if (data_dict.at[i, 'Type'] == 'Categorical' or
                    data_dict.at[i, 'Column Name'] in ['emp_title', 'is_bad']):
                if current_variable == variable_assignment["target"]:
                    type_current = "binary"
                else:
                    type_current = "categorical"
                variable_assignment["description"].append({
                    "name":
                    current_variable,
                    "type":
                    type_current,
                    "uniqueValues":
                    unique_values
                })
            else:
                variable_assignment["description"].append({
                    "name":
                    current_variable,
                    "type":
                    "numerical",
                    "uniqueValues":
                    unique_values
                })
            processed.append(current_variable)
    # columns present in the train file but not in the data dictionary
    # (e.g. the generated word_* indicator columns)
    all_columns = set(preprocessed_data.columns)
    remaining = all_columns.difference(processed)
    for key in remaining:
        counts = preprocessed_data[key].value_counts()
        unique_values = list(counts.keys())
        if '' in unique_values:
            unique_values.remove('')
        if len(unique_values) >= 2:
            selected_columns.append(key)
        variable_assignment["description"].append({
            "name": key,
            "type": "binary",
            "uniqueValues": ['0', '1']
        })
    variable_assignment["selectedColumns"] = selected_columns
    return variable_assignment
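
# Usage sketch (hypothetical paths, mirroring the __main__ block below): derive
# the settings for one training split and inspect the selected columns.
#
#   settings = generate_settings('../data/data_dictionary.csv',
#                                k_fold_files['train_files'][0])
#   print(settings['target'], len(settings['selectedColumns']))
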
def full_file_processing(full_train_file, variable_assignment, encoders,
                         algorithm_list=['logistic_regression',
                                         'gradient_boosting_classifier']):
    """Train every algorithm in algorithm_list on the full train file, save the
    fitted models and return the saved paths and the tree-based importances."""
    data_from_file = read_Nstr_from_Csv(full_train_file)
    _data = data_from_file.copy()
    x, y, counting_dictionary, variable_assignment = refineInputData(
        _data, variable_assignment, encoders, False)
    if y.shape[0] == 0:
        raise ValueError('Target column with name: "' +
                         variable_assignment["target"] + '" is not provided')
    importance_val_pairs = get_tree_importance(
        x, y.values.reshape(x.shape[0]), counting_dictionary, encoders)

    all_saved_models = []
    for algorithm_name in algorithm_list:
        if algorithm_name not in CONFIG["supported_algorithms"]:
            error_message = "Algorithm with name " + str(algorithm_name) + \
                            " is not supported. Select one from the list " + \
                            str(CONFIG["supported_algorithms"])
            raise Exception(error_message)
        train_method = get_function_by_name(function_name=algorithm_name + '_train',
                                                module_name=algorithm_name)
        model = train_method(x, y)
        model_path = '../data/full_model_' + str(algorithm_name) + '.bin'
        save_obj(algorithm_name, model_path)
        save_obj(encoders, model_path)
        save_obj(variable_assignment, model_path)
        save_obj(model, model_path)
        all_saved_models.append(model_path)
    return all_saved_models, importance_val_pairs
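
# Usage sketch (assumed inputs: "variable_assignment" from generate_settings and
# "encoders" from get_categorical_encoders, as in the __main__ block below):
#
#   saved_paths, importances = full_file_processing(
#       '../data/preprocessed_dataset.csv', variable_assignment, encoders,
#       algorithm_list=['logistic_regression'])
#   print(saved_paths)
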
def crossValidation(split_parameters,
                    algorithm_list=['logistic_regression',
                                    'gradient_boosting_classifier'],
                    threshold_step=0.01):
    """Run cross-validation for every algorithm in algorithm_list over the
    k-fold split files and return the aggregated threshold statistics."""

    working_directory = '../data/'

    if len(algorithm_list) < 1:
        raise ValueError("At least 1 algorithm should be selected for training")

    train_names_list = split_parameters["train_files"]
    valid_names_list = split_parameters["validation_files"]
    encoders_list = split_parameters["encoders"]
    description_list = split_parameters["statistics"]

    all_files = train_names_list.copy()
    all_files.extend(valid_names_list)
    for file_name in all_files:
        if not os.path.isfile(file_name):
            error_message = 'File "' + file_name + '" with input data does not exist.'
            raise ValueError(error_message)

    if threshold_step <= 0 or threshold_step >= 1:
        error_message = '"threshold_step" parameter should be a float number between 0.0' \
                        ' and 1.0. Current value is "' + str(threshold_step) + '".'
        raise ValueError(error_message)

    thresholds = numpy.arange(0, 1 + threshold_step, threshold_step)
    thresholds = numpy.concatenate((thresholds, numpy.array([0, 0.5, 1])))
    thresholds = numpy.unique(thresholds)

    for algorithm_description in algorithm_list:
        if algorithm_description not in CONFIG["supported_algorithms"]:
            error_message = "Algorithm with name " + str(algorithm_description) + \
                            " is not supported. Select one from the list " + \
                            str(CONFIG["supported_algorithms"])
            raise Exception(error_message)

    all_saved_models = {}
    file_index = 0
    for train_name in train_names_list:
        base_name = get_base_name(train_name)
        data_from_file = read_Nstr_from_Csv(train_name)
        _data = data_from_file.copy()
        current_variables = description_list[file_index].copy()
        x, y, counting_dictionary, current_variables = refineInputData(
            _data, current_variables, encoders_list[file_index])
        description_list[file_index] = current_variables
        if y.shape[0] == 0:
            raise ValueError('Target column with name: "' + CONFIG["target_column"] + '" is not provided')
        for algorithm_description in algorithm_list:
            all_saved_models.setdefault(algorithm_description, [])
            train_method = get_function_by_name(function_name=algorithm_description + '_train',
                                                module_name=algorithm_description)
            model_path = correct_path(base_name + algorithm_description, None, True,
                                      working_directory, ".bin", False)
            model = train_method(x, y)
            all_saved_models[algorithm_description].append(model_path)
            save_obj(model, model_path, is_first=True)
        file_index += 1
    final_validation = None

    for algorithm_description in algorithm_list:
        predict_method = get_function_by_name(function_name=algorithm_description + '_predict',
                                                module_name=algorithm_description)

        validation_results, total_count = cross_modelEvaluation(
            description_list, valid_names_list, encoders_list, predict_method,
            thresholds, all_saved_models[algorithm_description],
            algorithm_description)
        if final_validation is None:
            final_validation = {"thresholds": [float(Decimal("%.2f" % elem)) for elem in list(thresholds)],
                                "totalCount": total_count,
                                "algorithms": []
                                }
        final_validation["algorithms"].append(validation_results)

    for key in all_saved_models:
        for path_element in all_saved_models[key]:
            try:
                os.remove(path_element)
            except Exception as e:
                print(e)

    return final_validation
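
# Usage sketch (assumes k_fold_files already carries the "statistics" and
# "encoders" lists that the __main__ block below fills in):
#
#   report = crossValidation(k_fold_files,
#                            algorithm_list=['logistic_regression'],
#                            threshold_step=0.05)
#   print(report['totalCount'], report['thresholds'][:5])
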
def one_fileEvaluation(scheme_data, model,
                       load_file_name, encoders, thresholds,
                       predict_method):
    """Evaluate one trained model on one validation file: accumulate thresholded
    confusion matrices plus permutation-style importances for accuracy and
    log-loss."""
    selected_encoder = encoders["label"][scheme_data["target"]]

    for item in scheme_data["description"]:
        if item["name"] == scheme_data["target"]:
            if "uniqueValues" in item.keys():
                current_unique = item["uniqueValues"]
            else:
                error_message = "Scheme is corrupted. Cannot find unique values for tatget column."
                raise Exception(error_message)
            break
    class_names = [str(element) for element in current_unique]
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    model_names = selected_encoder.inverse_transform(model.classes_)
    warnings.resetwarnings()
    model_names = [class_names.index(i) for i in model_names]
    classes_number = len(class_names)
    positive_index = class_names.index('1')
    zero_matrix = numpy.zeros((classes_number, classes_number), dtype=numpy.int32)

    confusion_threshold = []

    for j in range(len(thresholds)):
        confusion_threshold.append(zero_matrix.copy())

    total_count = 0

    data_from_file = read_Nstr_from_Csv(load_file_name)

    columns = data_from_file.columns
    columns = [c.strip() for c in columns]
    missing_columns = set(scheme_data["selectedColumns"])
    missing_columns = missing_columns.difference(columns)

    if len(missing_columns) > 0:
        exception_string = 'Columns: ' + str(missing_columns) + \
                           ' required for prediction are not provided in description.'
        raise ValueError(exception_string)

    if data_from_file.shape[0] < 1:
        exception_string = 'Empty file was provided for validation. Try adjusting data partition parameters.'
        raise ValueError(exception_string)

    read_data = data_from_file.shape[0]
    total_count += read_data

    x, y, counting_dictionary, _ = refineInputData(data_from_file.copy(), scheme_data, encoders)

    old_shape = y.shape
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    predicted_y_name = encoders["label"][scheme_data["target"]].inverse_transform(y)
    warnings.resetwarnings()
    y = numpy.asarray([class_names.index(i) for i in predicted_y_name], dtype=numpy.int32)
    y.shape = old_shape

    importance_accuracy = {key: 0.0 for key in scheme_data["selectedColumns"] if key != scheme_data['target']}
    initial_correct_accuracy = 0.0
    importance_logloss = {key: 0.0 for key in scheme_data["selectedColumns"] if key != scheme_data['target']}
    initial_correct_logloss = 0.0

    logLoss_value = 0.0
    importance_check_names = [None] + list(scheme_data["selectedColumns"])
    if scheme_data['target'] in importance_check_names:
        importance_check_names.remove(scheme_data['target'])
    for column_name in importance_check_names:
        if column_name is None:
            predicted_y, predicted_score = predict_method(x, classifier=model)
            predicted_score = predicted_score[:, model_names]
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            predicted_y_name = encoders["label"][scheme_data["target"]].inverse_transform(predicted_y)
            warnings.resetwarnings()
            predicted_y = numpy.asarray([class_names.index(i) for i in predicted_y_name], dtype=numpy.int32)
            old_shape = predicted_y.shape
            predicted_y.shape = (predicted_y.size, 1)
            initial_correct_accuracy += numpy.sum(numpy.equal(y, predicted_y))
            logLoss_value = log_loss(predicted_score, y, positive_index)
            predicted_y.shape = old_shape
        else:
            current_description = None
            for elem in scheme_data["description"]:
                if elem["name"] == column_name:
                    current_description = elem
                    break
            start_column = counting_dictionary[column_name][0]
            end_column = counting_dictionary[column_name][1]
            end_column += start_column
            old_value = x.values[:, start_column:end_column]
            if ("mean" in current_description.keys()) and (
                    "std" in current_description.keys()):
                # standardized numerical column: replace it with standard normal noise
                x.values[:, start_column] = numpy.random.normal(0.0, 1.0, size=x.shape[0])
            else:
                if "uniqueValues" in current_description.keys():
                    current_unique = current_description["uniqueValues"]
                else:
                    current_unique = []
                if len(current_unique) > 0:
                    # replace the column with values drawn uniformly from its unique values
                    selected_ind = numpy.asarray(
                        numpy.random.choice(len(current_unique), size=x.shape[0],
                                            replace=True), dtype=int)
                    generated = numpy.asarray(current_unique)[selected_ind]
                    if current_description["type"] == "numerical":
                        labeled_code = to_numeric(generated, errors='coerce', downcast='float')
                        labeled_code.shape = (labeled_code.size, 1)
                        x.values[:, start_column:end_column] = labeled_code
                    elif current_description["type"] in ["categorical", "binary"]:
                        labeled_code = encoders["label"][column_name].transform(generated)
                        labeled_code.shape = (labeled_code.size, 1)
                        if current_description["type"] == "categorical":
                            labeled_code = encoders["onehot"][column_name].transform(labeled_code)
                        # write the perturbed values back for categorical and binary columns alike
                        x.values[:, start_column:end_column] = labeled_code
                else:
                    # no unique values recorded: shuffle the existing column instead
                    permutation_index = numpy.random.permutation(x.shape[0])
                    x.values[:, start_column:end_column] = x.values[permutation_index,
                                                                    start_column:end_column]

            predicted_y_column, predicted_score_column = predict_method(x, classifier=model)
            predicted_score_column = predicted_score_column[:, model_names]
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            predicted_y_name = encoders["label"][scheme_data["target"]].inverse_transform(predicted_y_column)
            warnings.resetwarnings()
            predicted_y_column = numpy.asarray([class_names.index(i) for i in predicted_y_name], dtype=numpy.int32)
            x.values[:, start_column:end_column] = old_value

            if len(predicted_y_column.shape) < 2:
                predicted_y_column.shape = (predicted_y_column.size, 1)
            importance_accuracy[column_name] += numpy.sum(numpy.equal(y, predicted_y_column))
            importance_logloss[column_name] += log_loss(predicted_score_column, y, positive_index)

    initial_confusion = zero_matrix.copy()
    initial_confusion += get_confusion_matrix(y, predicted_y, classes_number)

    for j in range(len(thresholds)):
        for l in range(len(y)):
            threshold_mask = predicted_score[l, :] >= thresholds[j]
            if numpy.count_nonzero(threshold_mask) > 1 and classes_number > 2:
                ind = numpy.min(numpy.argmax(predicted_score[l, :]))
            elif numpy.count_nonzero(threshold_mask) > 1:
                ind = 0
            elif numpy.count_nonzero(threshold_mask) == 0:
                ind = classes_number - 1
            else:
                ind = numpy.argmax(threshold_mask)
            if y[l] >= classes_number:
                continue
            confusion_threshold[j][ind, y[l]] += 1
    return [total_count, initial_confusion, confusion_threshold,
            initial_correct_accuracy, importance_accuracy,
            initial_correct_logloss, importance_logloss,
            logLoss_value]
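
# one_fileEvaluation is normally called per fold from cross_modelEvaluation; a
# direct call would look roughly like the sketch below (all variables are
# assumptions taken from crossValidation above, not part of the original file):
#
#   predict_fn = get_function_by_name(
#       function_name='logistic_regression_predict',
#       module_name='logistic_regression')
#   results = one_fileEvaluation(scheme_data, model, valid_names_list[0],
#                                encoders_list[0], thresholds, predict_fn)
#   total_count, initial_confusion = results[0], results[1]
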
def filter_text_and_categorical_values(data_path, save_path):
    """Clean the text and categorical columns of the raw dataset and write the
    preprocessed result to save_path."""
    data = read_Nstr_from_Csv(data_path)
    target_column = CONFIG['target_column']

    # convert all text to lower case
    data = data.applymap(make_lower)

    # convert the earliest credit line opening date to the number of days
    # passed since that date
    data['earliest_cr_line'] = data['earliest_cr_line'].apply(date_coversion)

    # re-map employer titles so that their frequency distribution is less sparse
    data['emp_title'] = data['emp_title'].apply(filter_employment)

    # compute frequencies for each of the employment types
    all_employers = {}
    data['emp_title'].apply(word_frequency, all_employers=all_employers)

    # compute the frequency of each word in Notes, excluding stop words
    all_word_in_notes = {}
    data['Notes'] = data['Notes'].apply(filter_notes,
                                        all_word=all_word_in_notes)

    # keep only Notes words that occur more than 400 times
    keep_words = []
    for key in all_word_in_notes.keys():
        if all_word_in_notes[key] > 400:
            keep_words.append(key)
    data['Notes'] = data['Notes'].apply(clear_notes_using_dictionary,
                                        all_words_kept=keep_words)

    # keep only employer titles that occur more than 40 times
    keep_employers = []
    for key in all_employers.keys():
        if all_employers[key] > 40:
            keep_employers.append(key)

    data['emp_title'] = data['emp_title'].apply(
        clear_employment_using_dictionary, all_employments_kept=keep_employers)

    # keep only states that occur more than 100 times
    keep_states = []
    address_statistics = data['addr_state'].value_counts()
    for addr_key in address_statistics.keys():
        if address_statistics[addr_key] > 100:
            keep_states.append(addr_key)
    data['addr_state'] = data['addr_state'].apply(
        clear_employment_using_dictionary, all_employments_kept=keep_states)

    # keep only zip codes that occur more than 100 times
    keep_zips = []
    address_statistics = data['zip_code'].value_counts()
    for addr_key in address_statistics.keys():
        if address_statistics[addr_key] > 100:
            keep_zips.append(addr_key)

    data['zip_code'] = data['zip_code'].apply(
        clear_employment_using_dictionary, all_employments_kept=keep_zips)

    data['purpose_cat'] = data['purpose_cat'].apply(
        purpose_handling_small_buisness, keep_business=False)

    word_data_frame = DataFrame(0,
                                dtype=np.int64,
                                index=np.arange(data.shape[0]),
                                columns=[('word_' + word)
                                         for word in keep_words])
    # set the 'word_<word>' indicator to 1 for every kept word present in a record's Notes
    column_notes = data['Notes'].values
    for i in range(data.shape[0]):
        if len(column_notes[i]) == 0:
            continue
        record_words = column_notes[i]
        for word in record_words:
            word_data_frame.at[i, 'word_' + word] = 1

    data = data.drop(columns=CONFIG['remove_columns'])
    data = pandas.concat([data, word_data_frame], axis=1)
    filtered_columns = [name.lower().strip() for name in data.columns]
    data.columns = filtered_columns
    data.to_csv(save_path, index=False, encoding='utf-8')


if __name__ == '__main__':
    if not sys.warnoptions:
        import warnings
        warnings.simplefilter("ignore")

    # input dataset file
    data_path = '../data/dataset.csv'
    data_dict_path = '../data/data_dictionary.csv'

    # where to save the preliminary version after column removal and text filtering
    preprocessed_data_path = '../data/preprocessed_dataset.csv'
    filter_text_and_categorical_values(data_path, preprocessed_data_path)
    data = read_Nstr_from_Csv(preprocessed_data_path)
    k_fold_files = split_for_validation(data, CONFIG['target_column'],
                                        'k-fold', CONFIG['k_fold_k'],
                                        CONFIG['hold_out_train_size'])
    hold_out_files = split_for_validation(data, CONFIG['target_column'],
                                          'random', CONFIG['k_fold_k'],
                                          CONFIG['hold_out_train_size'])

    k_fold_files["statistics"] = []
    k_fold_files["encoders"] = []
    for path in k_fold_files['train_files']:
        current_statistics = generate_settings(data_dict_path, path)
        #current_statistics["selectedColumns"] = ['addr_state','is_bad','zip_code']
        k_fold_files["statistics"].append(current_statistics)
        k_fold_files["encoders"].append(
            get_categorical_encoders(k_fold_files["statistics"][-1]))