def gpu_load_names(**kwargs):
    """
    Loads names used for renaming the banks.

    Keyword Args
    ------------
    col_names_path : str, optional
        Path to the names CSV. Falls back to the module-level
        ``col_names_path`` global (the original behavior) when not given.

    Returns
    -------
    GPU DataFrame
    """
    chronometer = Chronometer.makeStarted()
    cols = ['seller_name', 'new_seller_name']
    dtypes = OrderedDict([
        ("seller_name", "category"),
        ("new_seller_name", "category"),
    ])
    # Generalization: let callers override the path via **kwargs without
    # touching the module global. Only read the global when no override is
    # supplied, so behavior is unchanged for existing callers.
    if 'col_names_path' in kwargs:
        path = kwargs['col_names_path']
    else:
        path = col_names_path
    names_table = pyblazing.create_table(table_name='names',
                                         type=get_type_schema(path),
                                         path=path,
                                         delimiter='|',
                                         names=cols,
                                         dtypes=get_dtype_values(dtypes),
                                         skip_rows=1)
    Chronometer.show(chronometer, 'Read Names CSV')
    return names_table
def create_delinq_features(table, **kwargs):
    """
    Computes the first delinquency date per loan at 30/90/180-day severity.

    Runs three grouped queries against ``main.perf`` — earliest
    ``monthly_reporting_period`` where the delinquency status is >= 1, >= 3
    and >= 6 — then left-joins them on ``loan_id``. Loans that never reach
    the 90/180 thresholds get the sentinel DATE '1970-01-01'.

    Parameters
    ----------
    table : a pyblazing table wrapper exposing ``.name`` and ``.columns``.

    Returns
    -------
    GPU DataFrame with loan_id and delinquency_30/90/180 columns.
    """
    chronometer = Chronometer.makeStarted()
    query = """SELECT loan_id, min(monthly_reporting_period) as delinquency_30
        FROM main.perf
        where current_loan_delinquency_status >= 1 group by loan_id"""
    result_delinq_30 = pyblazing.run_query(query, {table.name: table.columns})

    query = """SELECT loan_id, min(monthly_reporting_period) as delinquency_90
        FROM main.perf
        where current_loan_delinquency_status >= 3 group by loan_id"""
    result_delinq_90 = pyblazing.run_query(query, {table.name: table.columns})

    query = """SELECT loan_id, min(monthly_reporting_period) as delinquency_180
        FROM main.perf
        where current_loan_delinquency_status >= 6 group by loan_id"""
    result_delinq_180 = pyblazing.run_query(query, {table.name: table.columns})

    new_tables = {
        "delinq_30": result_delinq_30.columns,
        "delinq_90": result_delinq_90.columns,
        "delinq_180": result_delinq_180.columns
    }
    # Left joins keep every loan that was ever 30 days delinquent; COALESCE
    # backfills the rarer 90/180 dates with the epoch sentinel.
    query = """SELECT d30.loan_id, delinquency_30,
            COALESCE(delinquency_90, DATE '1970-01-01') as delinquency_90,
            COALESCE(delinquency_180, DATE '1970-01-01') as delinquency_180
        FROM main.delinq_30 as d30
        LEFT OUTER JOIN main.delinq_90 as d90 ON d30.loan_id = d90.loan_id
        LEFT OUTER JOIN main.delinq_180 as d180 ON d30.loan_id = d180.loan_id"""
    result_merge = pyblazing.run_query(query, new_tables)
    # Fixed typo in the timing label ('deliquency' -> 'delinquency').
    Chronometer.show(chronometer, 'Create delinquency features')
    return result_merge
def final_performance_delinquency(gdf, joined_df, **kwargs):
    """
    Attach the 12-month delinquency feature to each performance row.

    Left-joins ``gdf`` (performance) with ``joined_df`` on loan_id plus the
    year/month extracted from monthly_reporting_period.

    Returns
    -------
    GPU DataFrame with performance columns plus delinquency_12.
    """
    timer = Chronometer.makeStarted()
    source_tables = {"gdf": gdf, "joined_df": joined_df}
    sql = """SELECT g.loan_id, current_actual_upb, current_loan_delinquency_status,
        delinquency_12, interest_rate, loan_age, mod_flag, msa, non_interest_bearing_upb
        FROM main.gdf as g
        LEFT OUTER JOIN main.joined_df as j
        ON g.loan_id = j.loan_id
        and EXTRACT(YEAR FROM g.monthly_reporting_period) = j.timestamp_year
        and EXTRACT(MONTH FROM g.monthly_reporting_period) = j.timestamp_month """
    joined = pyblazing.run_query(sql, source_tables)
    Chronometer.show(timer, 'Final performance delinquency')
    return joined
def join_perf_acq_gdfs(perf, acq, **kwargs):
    """
    Left-join the performance and acquisition frames on loan_id.

    Parameters
    ----------
    perf, acq : GPU DataFrames (performance-side and acquisition-side).

    Returns
    -------
    GPU DataFrame combining performance features with the static
    acquisition attributes of each loan.
    """
    chronometer = Chronometer.makeStarted()
    tables = {"perf": perf, "acq": acq}
    query = """SELECT p.loan_id, current_actual_upb, current_loan_delinquency_status,
        delinquency_12, interest_rate, loan_age, mod_flag, msa,
        non_interest_bearing_upb, borrower_credit_score, dti, first_home_buyer,
        loan_purpose, mortgage_insurance_percent, num_borrowers, num_units,
        occupancy_status, orig_channel, orig_cltv, orig_date, orig_interest_rate,
        orig_loan_term, orig_ltv, orig_upb, product_type, property_state,
        property_type, relocation_mortgage_indicator, seller_name, zip
        FROM main.perf as p
        LEFT OUTER JOIN main.acq as a ON p.loan_id = a.loan_id"""
    results = pyblazing.run_query(query, tables)
    # Fixed typo in the timing label ('acquitistion' -> 'acquisition').
    Chronometer.show(chronometer, 'Join performance acquisition gdfs')
    return results
def create_ever_features(table, **kwargs):
    """
    Flag, per loan, whether it was ever 30/90/180 days delinquent.

    Aggregates ``main.perf`` by loan_id and compares the maximum observed
    delinquency status against the 1/3/6 thresholds.

    Returns
    -------
    GPU DataFrame with loan_id and boolean ever_30/ever_90/ever_180.
    """
    clock = Chronometer.makeStarted()
    ever_sql = """SELECT loan_id,
        max(current_loan_delinquency_status) >= 1 as ever_30,
        max(current_loan_delinquency_status) >= 3 as ever_90,
        max(current_loan_delinquency_status) >= 6 as ever_180
        FROM main.perf group by loan_id"""
    ever_features = pyblazing.run_query(ever_sql, {table.name: table.columns})
    Chronometer.show(clock, 'Create Ever Features')
    return ever_features
def combine_joined_12_mon(joined_df, testdf, **kwargs):
    """
    Merge the per-month 12-month features back onto the joined frame.

    Left-joins ``joined_df`` with ``testdf`` on loan_id and the
    timestamp_year/timestamp_month pair, pulling in delinquency_12 and
    upb_12.

    Returns
    -------
    GPU DataFrame with the joined_df columns plus delinquency_12/upb_12.
    """
    chronometer = Chronometer.makeStarted()
    tables = {"joined_df": joined_df, "testdf": testdf}
    query = """SELECT j.loan_id, j.mrp_timestamp, j.timestamp_month,
        j.timestamp_year, j.ever_30, j.ever_90, j.ever_180, j.delinquency_30,
        j.delinquency_90, j.delinquency_180, t.delinquency_12, t.upb_12
        FROM main.joined_df as j
        LEFT OUTER JOIN main.testdf as t
        ON j.loan_id = t.loan_id
        and j.timestamp_year = t.timestamp_year
        and j.timestamp_month = t.timestamp_month"""
    results = pyblazing.run_query(query, tables)
    # Fixed typo in the timing label ('joind' -> 'joined').
    Chronometer.show(chronometer, 'Combine joined 12 month')
    return results
def join_ever_delinq_features(everdf_tmp, delinq_merge, **kwargs):
    """
    Combine the ever-delinquent flags with the first-delinquency dates.

    Left-joins ``everdf_tmp`` (ever_30/90/180 flags) with ``delinq_merge``
    (delinquency_30/90/180 dates) on loan_id; loans with no delinquency
    date get the sentinel DATE '1970-01-01' via COALESCE.

    Returns
    -------
    GPU DataFrame with loan_id, ever_* flags and delinquency_* dates.
    """
    chronometer = Chronometer.makeStarted()
    tables = {"everdf": everdf_tmp, "delinq": delinq_merge}
    query = """SELECT everdf.loan_id as loan_id, ever_30, ever_90, ever_180,
        COALESCE(delinquency_30, DATE '1970-01-01') as delinquency_30,
        COALESCE(delinquency_90, DATE '1970-01-01') as delinquency_90,
        COALESCE(delinquency_180, DATE '1970-01-01') as delinquency_180
        FROM main.everdf as everdf
        LEFT OUTER JOIN main.delinq as delinq
        ON everdf.loan_id = delinq.loan_id"""
    result_merge = pyblazing.run_query(query, tables)
    # Fixed typo in the timing label ('deliquency' -> 'delinquency').
    Chronometer.show(chronometer, 'Create ever delinquency features')
    return result_merge
def last_mile_cleaning(df, **kwargs):
    """
    Final conversion of the feature frame into purely numeric columns.

    Mutates ``df`` in place (and returns it): category columns are replaced
    by their integer codes, every column is cast to float32, the
    delinquency_12 label is binarized to int32, and remaining NAs are
    filled with -1.

    Returns
    -------
    The same DataFrame, fully numeric, ready for XGBoost.
    """
    chronometer = Chronometer.makeStarted()
    # .items() replaces Series.iteritems(), which was deprecated and then
    # removed in pandas 2.x (and dropped by cudf as well).
    for col, dtype in df.dtypes.items():
        if str(dtype) == 'category':
            df[col] = df[col].cat.codes
        df[col] = df[col].astype('float32')
    # Binarize the label: any positive delinquency becomes 1. NaN > 0 is
    # already False, so fillna(False) is a belt-and-braces no-op.
    df['delinquency_12'] = df['delinquency_12'] > 0
    df['delinquency_12'] = df['delinquency_12'].fillna(False).astype('int32')
    for column in df.columns:
        df[column] = df[column].fillna(-1)
    Chronometer.show(chronometer, 'Last mile cleaning')
    return df
def gpu_load_acquisition_csv(acquisition_path, **kwargs):
    """
    Loads acquisition data.

    Parameters
    ----------
    acquisition_path : str
        Path to the pipe-delimited acquisition CSV (header row skipped).

    Returns
    -------
    GPU DataFrame
    """
    chronometer = Chronometer.makeStarted()
    cols = [
        'loan_id', 'orig_channel', 'seller_name', 'orig_interest_rate',
        'orig_upb', 'orig_loan_term', 'orig_date', 'first_pay_date',
        'orig_ltv', 'orig_cltv', 'num_borrowers', 'dti',
        'borrower_credit_score', 'first_home_buyer', 'loan_purpose',
        'property_type', 'num_units', 'occupancy_status', 'property_state',
        'zip', 'mortgage_insurance_percent', 'product_type',
        'coborrow_credit_score', 'mortgage_insurance_type',
        'relocation_mortgage_indicator'
    ]
    dtypes = OrderedDict([
        ("loan_id", "int64"),
        ("orig_channel", "category"),
        ("seller_name", "category"),
        ("orig_interest_rate", "float64"),
        ("orig_upb", "int64"),
        ("orig_loan_term", "int64"),
        ("orig_date", "date"),
        ("first_pay_date", "date"),
        ("orig_ltv", "float64"),
        ("orig_cltv", "float64"),
        ("num_borrowers", "float64"),
        ("dti", "float64"),
        ("borrower_credit_score", "float64"),
        ("first_home_buyer", "category"),
        ("loan_purpose", "category"),
        ("property_type", "category"),
        ("num_units", "int64"),
        ("occupancy_status", "category"),
        ("property_state", "category"),
        ("zip", "int64"),
        ("mortgage_insurance_percent", "float64"),
        ("product_type", "category"),
        ("coborrow_credit_score", "float64"),
        ("mortgage_insurance_type", "float64"),
        ("relocation_mortgage_indicator", "category"),
    ])
    # Removed stray debug print(acquisition_path) left over from development.
    acquisition_table = pyblazing.create_table(
        table_name='acq',
        type=get_type_schema(acquisition_path),
        path=acquisition_path,
        delimiter='|',
        names=cols,
        dtypes=get_dtype_values(dtypes),
        skip_rows=1)
    Chronometer.show(chronometer, 'Read Acquisition CSV')
    return acquisition_table
def merge_names(names_table, acq_table):
    """
    Replace each acquisition row's seller_name with its canonical name.

    Left-joins the acquisition table against the names lookup table on the
    raw seller_name and projects ``new_seller_name`` back out under the
    original ``seller_name`` alias.

    Returns
    -------
    GPU DataFrame: acquisition columns with normalized seller_name.
    """
    timer = Chronometer.makeStarted()
    source_tables = {
        names_table.name: names_table.columns,
        acq_table.name: acq_table.columns,
    }
    sql = """SELECT loan_id, orig_channel, orig_interest_rate, orig_upb,
        orig_loan_term, orig_date, first_pay_date, orig_ltv, orig_cltv,
        num_borrowers, dti, borrower_credit_score, first_home_buyer,
        loan_purpose, property_type, num_units, occupancy_status,
        property_state, zip, mortgage_insurance_percent, product_type,
        coborrow_credit_score, mortgage_insurance_type,
        relocation_mortgage_indicator, new_seller_name as seller_name
        FROM main.acq as a
        LEFT OUTER JOIN main.names as n ON a.seller_name = n.seller_name"""
    merged = pyblazing.run_query(sql, source_tables)
    Chronometer.show(timer, 'Create Acquisition (Merge Names)')
    return merged
def create_12_mon_features_union(joined_df, **kwargs):
    """
    Build the 12-month delinquency/UPB features as a 12-way SQL UNION.

    First collapses joined_df to one row per (loan_id, month-index), then,
    for each month offset y in 1..12, re-aggregates by a shifted 12-month
    bucket and finally UNION ALLs the twelve results back together,
    reconstructing timestamp_year/timestamp_month from the bucket index.

    NOTE(review): the 24000 constant pairs with the "timestamp_year * 12 +
    timestamp_month - 24000.0" expression (i.e. years around 2000) —
    presumably chosen to keep josh_mody_n small; confirm against the data.
    """
    chronometer = Chronometer.makeStarted()
    tables = {"joined_df": joined_df}
    # Month index: 12*year + month, offset by 24000 to keep values small.
    josh_mody_n_str = "timestamp_year * 12 + timestamp_month - 24000.0"
    query = "SELECT loan_id, " + josh_mody_n_str + " as josh_mody_n, max(delinquency_12) as max_d12, min(upb_12) as min_upb_12 FROM main.joined_df as joined_df GROUP BY loan_id, " + josh_mody_n_str
    mastertemp = pyblazing.run_query(query, tables)
    all_temps = []
    all_tokens = []
    tables = {"joined_df": mastertemp.columns}
    n_months = 12
    # Submit all 12 shifted aggregations first (tokens), then collect the
    # results — lets the backend overlap execution of the 12 queries.
    for y in range(1, n_months + 1):
        # Shift the month index by y and bucket into 12-month windows.
        josh_mody_n_str = "floor((josh_mody_n - " + str(y) + ")/12.0)"
        query = "SELECT loan_id, " + josh_mody_n_str + " as josh_mody_n, max(max_d12) > 3 as max_d12_gt3, min(min_upb_12) = 0 as min_upb_12_eq0, min(min_upb_12) as upb_12 FROM main.joined_df as joined_df GROUP BY loan_id, " + josh_mody_n_str
        metaToken = pyblazing.run_query_get_token(query, tables)
        all_tokens.append(metaToken)
    for metaToken in all_tokens:
        temp = pyblazing.run_query_get_results(metaToken)
        all_temps.append(temp)
    # Stitch the twelve per-offset results into one UNION ALL query. The
    # first SELECT (y == 1) has no leading " UNION ALL"; subsequent pieces
    # carry it as a prefix, so concatenation order matters here.
    y = 1
    tables2 = {"temp1": all_temps[0].columns}
    union_query = "(SELECT loan_id, max_d12_gt3 + min_upb_12_eq0 as delinquency_12, upb_12, floor(((josh_mody_n * 12) + " + str(
        24000 + (y - 1)) + ")/12) as timestamp_year, josh_mody_n * 0 + " + str(
        y) + " as timestamp_month from main.temp" + str(y) + ")"
    for y in range(2, n_months + 1):
        tables2["temp" + str(y)] = all_temps[y - 1].columns
        query = " UNION ALL (SELECT loan_id, max_d12_gt3 + min_upb_12_eq0 as delinquency_12, upb_12, floor(((josh_mody_n * 12) + " + str(
            24000 + (y - 1)) + ")/12) as timestamp_year, josh_mody_n * 0 + " + str(
            y) + " as timestamp_month from main.temp" + str(y) + ")"
        union_query = union_query + query
    results = pyblazing.run_query(union_query, tables2)
    Chronometer.show(chronometer, 'Create 12 month features once')
    return results
def create_joined_df(gdf, everdf, **kwargs):
    """
    Join performance rows with the per-loan ever/delinquency features.

    Projects the reporting period into month/year components, fills missing
    delinquency status (-1), UPB (999999999.9) and delinquency dates
    (DATE '1970-01-01') with sentinels via COALESCE, and left-joins on
    loan_id so every performance row is retained.

    Returns
    -------
    GPU DataFrame keyed by loan_id and reporting period.
    """
    timer = Chronometer.makeStarted()
    source_tables = {"perf": gdf, "everdf": everdf}
    sql = """SELECT perf.loan_id as loan_id, perf.monthly_reporting_period as mrp_timestamp,
        EXTRACT(MONTH FROM perf.monthly_reporting_period) as timestamp_month,
        EXTRACT(YEAR FROM perf.monthly_reporting_period) as timestamp_year,
        COALESCE(perf.current_loan_delinquency_status, -1) as delinquency_12,
        COALESCE(perf.current_actual_upb, 999999999.9) as upb_12,
        everdf.ever_30 as ever_30, everdf.ever_90 as ever_90,
        everdf.ever_180 as ever_180,
        COALESCE(everdf.delinquency_30, DATE '1970-01-01') as delinquency_30,
        COALESCE(everdf.delinquency_90, DATE '1970-01-01') as delinquency_90,
        COALESCE(everdf.delinquency_180, DATE '1970-01-01') as delinquency_180
        FROM main.perf as perf
        LEFT OUTER JOIN main.everdf as everdf
        ON perf.loan_id = everdf.loan_id"""
    joined = pyblazing.run_query(sql, source_tables)
    Chronometer.show(timer, 'Create Joined DF')
    return joined
def gpu_load_performance_csv(performance_path, **kwargs):
    """
    Loads performance data.

    Parameters
    ----------
    performance_path : str
        Path to the pipe-delimited performance CSV (header row skipped).

    Returns
    -------
    GPU DataFrame
    """
    chronometer = Chronometer.makeStarted()
    cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate",
        "current_actual_upb", "loan_age", "remaining_months_to_legal_maturity",
        "adj_remaining_months_to_maturity", "maturity_date", "msa",
        "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date",
        "foreclosed_after", "disposition_date", "foreclosure_costs",
        "prop_preservation_and_repair_costs", "asset_recovery_costs",
        "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds",
        "other_foreclosure_proceeds", "non_interest_bearing_upb",
        "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount", "servicing_activity_indicator"
    ]
    dtypes = OrderedDict([
        ("loan_id", "int64"),
        ("monthly_reporting_period", "date"),
        ("servicer", "category"),
        ("interest_rate", "float64"),
        ("current_actual_upb", "float64"),
        ("loan_age", "float64"),
        ("remaining_months_to_legal_maturity", "float64"),
        ("adj_remaining_months_to_maturity", "float64"),
        ("maturity_date", "date"),
        ("msa", "float64"),
        ("current_loan_delinquency_status", "int32"),
        ("mod_flag", "category"),
        ("zero_balance_code", "category"),
        ("zero_balance_effective_date", "date"),
        ("last_paid_installment_date", "date"),
        ("foreclosed_after", "date"),
        ("disposition_date", "date"),
        ("foreclosure_costs", "float64"),
        ("prop_preservation_and_repair_costs", "float64"),
        ("asset_recovery_costs", "float64"),
        ("misc_holding_expenses", "float64"),
        ("holding_taxes", "float64"),
        ("net_sale_proceeds", "float64"),
        ("credit_enhancement_proceeds", "float64"),
        ("repurchase_make_whole_proceeds", "float64"),
        ("other_foreclosure_proceeds", "float64"),
        ("non_interest_bearing_upb", "float64"),
        ("principal_forgiveness_upb", "float64"),
        ("repurchase_make_whole_proceeds_flag", "category"),
        ("foreclosure_principal_write_off_amount", "float64"),
        ("servicing_activity_indicator", "category"),
    ])
    # Removed stray debug print(performance_path) left over from development.
    performance_table = pyblazing.create_table(
        table_name='perf',
        type=get_type_schema(performance_path),
        path=performance_path,
        delimiter='|',
        names=cols,
        dtypes=get_dtype_values(dtypes),
        skip_rows=1)
    Chronometer.show(chronometer, 'Read Performance CSV')
    return performance_table
# NOTE(review): this chunk appears to be the tail of a driver script; the
# first four statements accumulate per-chunk results and presumably sat
# inside a data-loading loop whose header is outside this view — confirm
# the original indentation/scope before relying on this layout.
final_cpu_df_label = pd.concat([final_cpu_df_label, cpu_df_label])
final_cpu_df_data = pd.concat([final_cpu_df_data, cpu_df_data])
xgb_convert_end_time = time.time()
all_xgb_convert_times.append(xgb_convert_end_time - xgb_convert_start_time)
# 80/20 train/test split; fixed seed keeps the split reproducible.
data_train, data_test, label_train, label_test = train_test_split(
    final_cpu_df_data, final_cpu_df_label, test_size=0.20, random_state=42)
xgdf_train = xgb.DMatrix(data_train, label_train)
xgdf_test = xgb.DMatrix(data_test, label_test)
chronometerTrain1 = Chronometer.makeStarted()
startTime = time.time()
# 'nround' is read from the params dict itself to set the boosting rounds.
bst = xgb.train(dxgb_gpu_params,
                xgdf_train,
                num_boost_round=dxgb_gpu_params['nround'])
Chronometer.show(chronometerTrain1, 'Train 1')
chronometerPredict1 = Chronometer.makeStarted()
preds = bst.predict(xgdf_test)
Chronometer.show(chronometerPredict1, 'Predict 1')
labels = xgdf_test.get_label()
# Misclassification rate with a 0.5 probability threshold.
print('prediction error=%f' %
      (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) /
       float(len(preds))))