Example #1
0
def create_joined_df(gdf, everdf, **kwargs):
    """Left-join performance rows (gdf) with per-loan ever-delinquency
    features (everdf) on loan_id.

    Null handling is done in SQL: missing delinquency dates default to the
    epoch, missing status defaults to -1, and missing UPB to a large
    sentinel. Returns the pyblazing query result.
    """
    timer = Chronometer.makeStarted()
    sql_tables = {"perf": gdf, "everdf": everdf}

    join_sql = """SELECT perf.loan_id as loan_id,
                perf.monthly_reporting_period as mrp_timestamp,
                EXTRACT(MONTH FROM perf.monthly_reporting_period)
                    as timestamp_month,
                EXTRACT(YEAR FROM perf.monthly_reporting_period)
                    as timestamp_year,
                COALESCE(perf.current_loan_delinquency_status, -1)
                    as delinquency_12,
                COALESCE(perf.current_actual_upb, 999999999.9) as upb_12,
                everdf.ever_30 as ever_30,
                everdf.ever_90 as ever_90,
                everdf.ever_180 as ever_180,
                COALESCE(everdf.delinquency_30, DATE '1970-01-01')
                    as delinquency_30,
                COALESCE(everdf.delinquency_90, DATE '1970-01-01')
                    as delinquency_90,
                COALESCE(everdf.delinquency_180, DATE '1970-01-01')
                    as delinquency_180
                FROM main.perf as perf
                LEFT OUTER JOIN main.everdf as everdf
                ON perf.loan_id = everdf.loan_id"""

    joined = pyblazing.run_query(join_sql, sql_tables)
    Chronometer.show(timer, "Create Joined DF")
    return joined
Example #2
0
def create_delinq_features(table, **kwargs):
    """Find, per loan, the first reporting period at which the loan reached
    30/90/180+ days of delinquency, then left-join the three results on
    loan_id (missing 90/180 dates default to the epoch in SQL).
    """
    timer = Chronometer.makeStarted()
    perf_tables = {table.name: table.columns}

    sql_30 = """SELECT loan_id,
        min(monthly_reporting_period) as delinquency_30
        FROM main.perf where current_loan_delinquency_status >= 1 group by loan_id"""
    delinq_30 = pyblazing.run_query(sql_30, perf_tables)

    sql_90 = """SELECT loan_id,
        min(monthly_reporting_period) as delinquency_90
        FROM main.perf where current_loan_delinquency_status >= 3 group by loan_id"""
    delinq_90 = pyblazing.run_query(sql_90, perf_tables)

    sql_180 = """SELECT loan_id,
        min(monthly_reporting_period) as delinquency_180
        FROM main.perf where current_loan_delinquency_status >= 6 group by loan_id"""
    delinq_180 = pyblazing.run_query(sql_180, perf_tables)

    merge_tables = {
        "delinq_30": delinq_30.columns,
        "delinq_90": delinq_90.columns,
        "delinq_180": delinq_180.columns
    }
    merge_sql = """SELECT d30.loan_id, delinquency_30, COALESCE(delinquency_90, DATE '1970-01-01') as delinquency_90,
                COALESCE(delinquency_180, DATE '1970-01-01') as delinquency_180 FROM main.delinq_30 as d30
                LEFT OUTER JOIN main.delinq_90 as d90 ON d30.loan_id = d90.loan_id
                LEFT OUTER JOIN main.delinq_180 as d180 ON d30.loan_id = d180.loan_id"""
    merged = pyblazing.run_query(merge_sql, merge_tables)
    Chronometer.show(timer, 'Create deliquency features')
    return merged
Example #3
0
def join_ever_delinq_features(everdf_tmp, delinq_merge, **kwargs):
    """Left-join the ever_30/90/180 flags with the first-delinquency dates
    on loan_id, then replace nulls in the three delinquency date columns
    with the Unix epoch (1970-01-01).

    Parameters
    ----------
    everdf_tmp : table columns holding loan_id and the ever_* flags
    delinq_merge : table columns holding loan_id and delinquency_* dates

    Returns
    -------
    pyblazing result whose null delinquency dates are epoch-filled.
    """
    chronometer = Chronometer.makeStarted()
    tables = {"everdf": everdf_tmp, "delinq": delinq_merge}
    query = """
        SELECT everdf.loan_id as loan_id, ever_30, ever_90, ever_180,
            delinquency_30,
            delinquency_90,
            delinquency_180 FROM main.everdf as everdf
            LEFT OUTER JOIN main.delinq as delinq
            ON everdf.loan_id = delinq.loan_id
    """
    result_merge = pyblazing.run_query(query, tables)
    # Compute the epoch constant once and fill all three date columns in a
    # single loop instead of three copy-pasted fillna blocks (same behavior).
    epoch = np.dtype("datetime64[ms]").type("1970-01-01").astype(
        "datetime64[ms]")
    for col in ("delinquency_30", "delinquency_90", "delinquency_180"):
        result_merge.columns[col] = result_merge.columns[col].fillna(epoch)
    Chronometer.show(chronometer, "Create ever deliquency features")
    return result_merge
Example #4
0
def gpu_load_names(**kwargs):
    """ Loads names used for renaming the banks

    Reads a '|'-delimited CSV of (seller_name, new_seller_name) pairs and
    registers it with pyblazing under the table name 'names'.

    NOTE(review): `col_names_path` is not a parameter here — it must exist
    as a module-level global at call time; confirm the caller defines it.

    Returns
    -------
    GPU DataFrame
    """
    chronometer = Chronometer.makeStarted()

    cols = ['seller_name', 'new_seller_name']

    # Both columns are categorical strings.
    dtypes = OrderedDict([
        ("seller_name", "category"),
        ("new_seller_name", "category"),
    ])

    # skip_rows=1 drops the CSV header row.
    names_table = pyblazing.create_table(table_name='names',
                                         type=get_type_schema(col_names_path),
                                         path=col_names_path,
                                         delimiter='|',
                                         names=cols,
                                         dtypes=get_dtype_values(dtypes),
                                         skip_rows=1)
    Chronometer.show(chronometer, 'Read Names CSV')
    return names_table
Example #5
0
def gpu_load_names(col_names_path, **kwargs):
    """ Loads names used for renaming the banks

    Returns
    -------
    GPU DataFrame
    """
    clock = Chronometer.makeStarted()

    column_names = ["seller_name", "new_seller_name"]
    column_dtypes = OrderedDict([
        ("seller_name", "category"),
        ("new_seller_name", "category"),
    ])

    names_file = col_names_path + "names.load"
    print(names_file)

    names_table = pyblazing.create_table(
        table_name="names",
        type=get_type_schema(names_file),
        path=names_file,
        delimiter="|",
        names=column_names,
        dtypes=column_dtypes,  # TODO: dtypes=get_dtype_values(column_dtypes)
        skip_rows=1,
    )
    Chronometer.show(clock, "Read Names CSV")
    return names_table
Example #6
0
def final_performance_delinquency(gdf, joined_df, **kwargs):
    """Attach delinquency_12 to each performance row by left-joining on
    loan_id plus the year and month of the monthly reporting period."""
    timer = Chronometer.makeStarted()
    sql_tables = {"gdf": gdf, "joined_df": joined_df}
    sql = """SELECT g.loan_id, current_actual_upb, current_loan_delinquency_status, delinquency_12, interest_rate, loan_age, mod_flag, msa, non_interest_bearing_upb 
        FROM main.gdf as g LEFT OUTER JOIN main.joined_df as j
        ON g.loan_id = j.loan_id and EXTRACT(YEAR FROM g.monthly_reporting_period) = j.timestamp_year and EXTRACT(MONTH FROM g.monthly_reporting_period) = j.timestamp_month """
    merged = pyblazing.run_query(sql, sql_tables)
    Chronometer.show(timer, 'Final performance delinquency')
    return merged
Example #7
0
def create_ever_features(table, **kwargs):
    """Flag, per loan, whether it was ever 30/90/180+ days delinquent."""
    timer = Chronometer.makeStarted()
    sql = """SELECT loan_id,
        max(current_loan_delinquency_status) >= 1 as ever_30, 
        max(current_loan_delinquency_status) >= 3 as ever_90,
        max(current_loan_delinquency_status) >= 6 as ever_180
        FROM main.perf group by loan_id"""
    ever_features = pyblazing.run_query(sql, {table.name: table.columns})
    Chronometer.show(timer, 'Create Ever Features')
    return ever_features
Example #8
0
def join_perf_acq_gdfs(perf, acq, **kwargs):
    """Left-join performance rows with acquisition attributes on loan_id."""
    timer = Chronometer.makeStarted()
    sql_tables = {"perf": perf, "acq": acq}
    sql = """SELECT p.loan_id, current_actual_upb, current_loan_delinquency_status, delinquency_12, interest_rate, loan_age, mod_flag, msa, non_interest_bearing_upb,
     borrower_credit_score, dti, first_home_buyer, loan_purpose, mortgage_insurance_percent, num_borrowers, num_units, occupancy_status, 
     orig_channel, orig_cltv, orig_date, orig_interest_rate, orig_loan_term, orig_ltv, orig_upb, product_type, property_state, property_type, 
     relocation_mortgage_indicator, seller_name, zip FROM main.perf as p LEFT OUTER JOIN main.acq as a ON p.loan_id = a.loan_id"""
    joined = pyblazing.run_query(sql, sql_tables)
    Chronometer.show(timer, 'Join performance acquitistion gdfs')
    return joined
Example #9
0
def join_ever_delinq_features(everdf_tmp, delinq_merge, **kwargs):
    """Left-join ever_30/90/180 flags with first-delinquency dates on
    loan_id; missing dates default to the epoch via COALESCE in SQL."""
    timer = Chronometer.makeStarted()
    sql_tables = {"everdf": everdf_tmp, "delinq": delinq_merge}
    sql = """SELECT everdf.loan_id as loan_id, ever_30, ever_90, ever_180,
                  COALESCE(delinquency_30, DATE '1970-01-01') as delinquency_30,
                  COALESCE(delinquency_90, DATE '1970-01-01') as delinquency_90,
                  COALESCE(delinquency_180, DATE '1970-01-01') as delinquency_180 FROM main.everdf as everdf
                  LEFT OUTER JOIN main.delinq as delinq ON everdf.loan_id = delinq.loan_id"""
    merged = pyblazing.run_query(sql, sql_tables)
    Chronometer.show(timer, 'Create ever deliquency features')
    return merged
Example #10
0
def combine_joined_12_mon(joined_df, testdf, **kwargs):
    """Left-join the joined frame with the 12-month feature frame on
    loan_id plus timestamp year and month."""
    timer = Chronometer.makeStarted()
    sql_tables = {"joined_df": joined_df, "testdf": testdf}
    sql = """SELECT j.loan_id, j.mrp_timestamp, j.timestamp_month, j.timestamp_year, 
                j.ever_30, j.ever_90, j.ever_180, j.delinquency_30, j.delinquency_90, j.delinquency_180,
                t.delinquency_12, t.upb_12 
                FROM main.joined_df as j LEFT OUTER JOIN main.testdf as t 
                ON j.loan_id = t.loan_id and j.timestamp_year = t.timestamp_year and j.timestamp_month = t.timestamp_month"""
    combined = pyblazing.run_query(sql, sql_tables)
    Chronometer.show(timer, 'Combine joind 12 month')
    return combined
Example #11
0
def last_mile_cleaning(df, **kwargs):
    """Convert every column to float32 (categories via their codes), turn
    delinquency_12 into a 0/1 int32 label, and fill remaining nulls with -1.

    Parameters
    ----------
    df : dataframe mutated in place; must contain a 'delinquency_12' column.

    Returns
    -------
    The same dataframe, cleaned.
    """
    chronometer = Chronometer.makeStarted()
    # .items() replaces Series.iteritems(), which was removed in pandas 2.0;
    # .items() behaves identically on older versions as well.
    for col, dtype in df.dtypes.items():
        if str(dtype) == "category":
            df[col] = df[col].cat.codes
        df[col] = df[col].astype("float32")
    # Binarize the label: any positive delinquency status -> 1, else 0.
    df["delinquency_12"] = df["delinquency_12"] > 0
    df["delinquency_12"] = df["delinquency_12"].fillna(False).astype("int32")
    for column in df.columns:
        df[column] = df[column].fillna(-1)
    Chronometer.show(chronometer, "Last mile cleaning")
    return df
Example #12
0
def last_mile_cleaning(df, **kwargs):
    """Convert every column to float32 (categories via their codes), turn
    delinquency_12 into a 0/1 int32 label, and fill remaining nulls with -1.

    Parameters
    ----------
    df : dataframe mutated in place; must contain a 'delinquency_12' column.

    Returns
    -------
    The same dataframe, cleaned.
    """
    chronometer = Chronometer.makeStarted()
    # .items() replaces Series.iteritems(), which was removed in pandas 2.0;
    # .items() behaves identically on older versions as well.
    for col, dtype in df.dtypes.items():
        if str(dtype) == 'category':
            df[col] = df[col].cat.codes
        df[col] = df[col].astype('float32')
    # Binarize the label: any positive delinquency status -> 1, else 0.
    df['delinquency_12'] = df['delinquency_12'] > 0
    df['delinquency_12'] = df['delinquency_12'].fillna(False).astype('int32')
    for column in df.columns:
        df[column] = df[column].fillna(-1)
    Chronometer.show(chronometer, 'Last mile cleaning')
    return df
Example #13
0
def gpu_load_acquisition_csv(acquisition_path, **kwargs):
    """ Loads acquisition data

    Returns
    -------
    GPU DataFrame
    """
    clock = Chronometer.makeStarted()

    acq_cols = [
        'loan_id',
        'orig_channel',
        'seller_name',
        'orig_interest_rate',
        'orig_upb',
        'orig_loan_term',
        'orig_date',
        'first_pay_date',
        'orig_ltv',
        'orig_cltv',
        'num_borrowers',
        'dti',
        'borrower_credit_score',
        'first_home_buyer',
        'loan_purpose',
        'property_type',
        'num_units',
        'occupancy_status',
        'property_state',
        'zip',
        'mortgage_insurance_percent',
        'product_type',
        'coborrow_credit_score',
        'mortgage_insurance_type',
        'relocation_mortgage_indicator',
    ]

    acq_dtypes = OrderedDict([
        ("loan_id", "int64"),
        ("orig_channel", "category"),
        ("seller_name", "category"),
        ("orig_interest_rate", "float64"),
        ("orig_upb", "int64"),
        ("orig_loan_term", "int64"),
        ("orig_date", "date"),
        ("first_pay_date", "date"),
        ("orig_ltv", "float64"),
        ("orig_cltv", "float64"),
        ("num_borrowers", "float64"),
        ("dti", "float64"),
        ("borrower_credit_score", "float64"),
        ("first_home_buyer", "category"),
        ("loan_purpose", "category"),
        ("property_type", "category"),
        ("num_units", "int64"),
        ("occupancy_status", "category"),
        ("property_state", "category"),
        ("zip", "int64"),
        ("mortgage_insurance_percent", "float64"),
        ("product_type", "category"),
        ("coborrow_credit_score", "float64"),
        ("mortgage_insurance_type", "float64"),
        ("relocation_mortgage_indicator", "category"),
    ])

    print(acquisition_path)

    acq_table = pyblazing.create_table(
        table_name='acq',
        type=get_type_schema(acquisition_path),
        path=acquisition_path,
        delimiter='|',
        names=acq_cols,
        dtypes=get_dtype_values(acq_dtypes),
        skip_rows=1,
    )
    Chronometer.show(clock, 'Read Acquisition CSV')
    return acq_table
Example #14
0
def create_12_mon_features_union(joined_df, **kwargs):
    """Build delinquency_12 / upb_12 features for 12 monthly offsets and
    stack the per-offset results into one frame with UNION ALL.

    The month index `josh_mody_n` encodes (year, month) as a single number
    (year * 12 + month - 24000); each of the 12 passes groups by a shifted
    bucket of that index and aggregates delinquency and UPB.

    Returns
    -------
    pyblazing result of the unioned query.
    """
    chronometer = Chronometer.makeStarted()
    tables = {"joined_df": joined_df}
    # Flatten (timestamp_year, timestamp_month) into one monotonic index.
    josh_mody_n_str = "timestamp_year * 12 + timestamp_month - 24000.0"
    query = ("SELECT loan_id, " + josh_mody_n_str +
             " as josh_mody_n, max(delinquency_12) as max_d12," +
             " min(upb_12) as min_upb_12" +
             " FROM main.joined_df as joined_df GROUP BY loan_id, " +
             josh_mody_n_str)
    mastertemp = pyblazing.run_query(query, tables)

    all_temps = []
    all_tokens = []
    # Subsequent passes re-aggregate the pre-aggregated result.
    tables = {"joined_df": mastertemp.columns}
    n_months = 12

    # Submit all 12 shifted-window queries first (tokens), then collect the
    # results, so the queries can proceed concurrently.
    for y in range(1, n_months + 1):
        # Bucket the month index into years, shifted by the offset y.
        josh_mody_n_str = "floor((josh_mody_n - " + str(y) + ")/12.0)"
        query = ("SELECT loan_id, " + josh_mody_n_str +
                 " as josh_mody_n, max(max_d12) > 3 as max_d12_gt3," +
                 " min(min_upb_12) = 0 as min_upb_12_eq0," +
                 " min(min_upb_12) as upb_12" +
                 " FROM main.joined_df as joined_df GROUP BY loan_id, " +
                 josh_mody_n_str)

        metaToken = pyblazing.run_query_get_token(query, tables)
        all_tokens.append(metaToken)

    for metaToken in all_tokens:
        temp = pyblazing.run_query_get_results(metaToken)
        all_temps.append(temp)

    # Build one big UNION ALL query over the 12 per-offset temp tables,
    # recovering timestamp_year/timestamp_month from the bucketed index.
    y = 1
    tables2 = {"temp1": all_temps[0].columns}
    union_query = (
        """(SELECT loan_id, max_d12_gt3 + min_upb_12_eq0 as delinquency_12,
         upb_12, floor(((josh_mody_n * 12) + """ + str(24000 + (y - 1)) +
        ")/12) as timestamp_year, josh_mody_n * 0 + " + str(y) +
        " as timestamp_month from main.temp" + str(y) + ")")
    for y in range(2, n_months + 1):
        tables2["temp" + str(y)] = all_temps[y - 1].columns
        query = (""" UNION ALL (SELECT loan_id, max_d12_gt3 + min_upb_12_eq0 as
            delinquency_12, upb_12, floor(((josh_mody_n * 12) + """ +
                 str(24000 +
                     (y - 1)) + ")/12) as timestamp_year, josh_mody_n * 0 + " +
                 str(y) + " as timestamp_month from main.temp" + str(y) + ")")
        union_query = union_query + query

    results = pyblazing.run_query(union_query, tables2)
    Chronometer.show(chronometer, "Create 12 month features once")
    return results
Example #15
0
def create_delinq_features(table, **kwargs):
    """Find, per loan, the first reporting period at which the loan reached
    30/90/180+ days of delinquency, left-join the three results on loan_id,
    and epoch-fill the nullable 90/180 date columns.

    Parameters
    ----------
    table : object with .name and .columns, registered as main.perf.

    Returns
    -------
    pyblazing result with delinquency_30/90/180 per loan.
    """
    chronometer = Chronometer.makeStarted()
    query = """
        SELECT loan_id,
            min(monthly_reporting_period) as delinquency_30
        FROM main.perf
        where current_loan_delinquency_status >= 1 group by loan_id
    """
    result_delinq_30 = pyblazing.run_query(query, {table.name: table.columns})

    query = """
        SELECT loan_id,
            min(monthly_reporting_period) as delinquency_90
        FROM main.perf
        where current_loan_delinquency_status >= 3 group by loan_id
    """
    result_delinq_90 = pyblazing.run_query(query, {table.name: table.columns})

    query = """
        SELECT loan_id,
            min(monthly_reporting_period) as delinquency_180
        FROM main.perf
        where current_loan_delinquency_status >= 6 group by loan_id
    """
    result_delinq_180 = pyblazing.run_query(query, {table.name: table.columns})

    new_tables = {
        "delinq_30": result_delinq_30.columns,
        "delinq_90": result_delinq_90.columns,
        "delinq_180": result_delinq_180.columns,
    }
    query = """
        SELECT d30.loan_id, delinquency_30, delinquency_90,
            delinquency_180 FROM main.delinq_30 as d30
        LEFT OUTER JOIN main.delinq_90 as d90 ON d30.loan_id = d90.loan_id
        LEFT OUTER JOIN main.delinq_180 as d180 ON d30.loan_id = d180.loan_id
    """
    result_merge = pyblazing.run_query(query, new_tables)
    # The left joins can leave nulls in the 90/180 columns; fill them with
    # the epoch. Hoist the constant and loop instead of duplicating the
    # fillna block per column (same behavior, DRY).
    epoch = np.dtype("datetime64[ms]").type("1970-01-01").astype(
        "datetime64[ms]")
    for col in ("delinquency_90", "delinquency_180"):
        result_merge.columns[col] = result_merge.columns[col].fillna(epoch)
    Chronometer.show(chronometer, "Create deliquency features")
    return result_merge
Example #16
0
def merge_names(names_table, acq_table):
    """Replace seller_name in the acquisition table with the canonical
    new_seller_name from the names lookup table (left join)."""
    timer = Chronometer.makeStarted()
    sql_tables = {
        names_table.name: names_table.columns,
        acq_table.name: acq_table.columns
    }

    sql = """SELECT loan_id, orig_channel, orig_interest_rate, orig_upb, orig_loan_term, 
        orig_date, first_pay_date, orig_ltv, orig_cltv, num_borrowers, dti, borrower_credit_score, 
        first_home_buyer, loan_purpose, property_type, num_units, occupancy_status, property_state,
        zip, mortgage_insurance_percent, product_type, coborrow_credit_score, mortgage_insurance_type, 
        relocation_mortgage_indicator, new_seller_name as seller_name 
        FROM main.acq as a LEFT OUTER JOIN main.names as n ON  a.seller_name = n.seller_name"""
    renamed = pyblazing.run_query(sql, sql_tables)
    Chronometer.show(timer, 'Create Acquisition (Merge Names)')
    return renamed
Example #17
0
def create_joined_df(gdf, everdf, **kwargs):
    """Left-join performance rows (gdf) with per-loan ever-delinquency
    features (everdf) on loan_id, then fill nulls client-side: upb_12 gets
    a large sentinel, every other feature column gets -1.

    Returns
    -------
    pyblazing result with null features filled.
    """
    chronometer = Chronometer.makeStarted()
    tables = {"perf": gdf, "everdf": everdf}

    query = """SELECT perf.loan_id as loan_id, 
                perf.monthly_reporting_period as mrp_timestamp,
                EXTRACT(MONTH FROM perf.monthly_reporting_period) as timestamp_month,
                EXTRACT(YEAR FROM perf.monthly_reporting_period) as timestamp_year,
                perf.current_loan_delinquency_status as delinquency_12,
                perf.current_actual_upb as upb_12,
                everdf.ever_30 as ever_30,
                everdf.ever_90 as ever_90, 
                everdf.ever_180 as ever_180, 
                everdf.delinquency_30 as delinquency_30, 
                everdf.delinquency_90 as delinquency_90, 
                everdf.delinquency_180 as delinquency_180
                FROM main.perf as perf 
                LEFT OUTER JOIN main.everdf as everdf ON perf.loan_id = everdf.loan_id"""

    results = pyblazing.run_query(query, tables)

    # upb_12 uses a large sentinel; all other nullable feature columns
    # default to -1. One loop replaces seven copy-pasted fillna statements
    # (same columns, same order, same behavior).
    results.columns['upb_12'] = results.columns['upb_12'].fillna(999999999)
    for col in ('delinquency_12', 'ever_30', 'ever_90', 'ever_180',
                'delinquency_30', 'delinquency_90', 'delinquency_180'):
        results.columns[col] = results.columns[col].fillna(-1)

    Chronometer.show(chronometer, 'Create Joined DF')
    return results
Example #18
0
def gpu_load_acquisition_csv(acquisition_path, **kwargs):
    """ Loads acquisition data

    Returns
    -------
    GPU DataFrame
    """
    clock = Chronometer.makeStarted()

    acq_cols = [
        "loan_id", "orig_channel", "seller_name", "orig_interest_rate",
        "orig_upb", "orig_loan_term", "orig_date", "first_pay_date",
        "orig_ltv", "orig_cltv", "num_borrowers", "dti",
        "borrower_credit_score", "first_home_buyer", "loan_purpose",
        "property_type", "num_units", "occupancy_status", "property_state",
        "zip", "mortgage_insurance_percent", "product_type",
        "coborrow_credit_score", "mortgage_insurance_type",
        "relocation_mortgage_indicator",
    ]

    acq_dtypes = OrderedDict([
        ("loan_id", "int64"), ("orig_channel", "category"),
        ("seller_name", "category"), ("orig_interest_rate", "float64"),
        ("orig_upb", "int64"), ("orig_loan_term", "int64"),
        ("orig_date", "date"), ("first_pay_date", "date"),
        ("orig_ltv", "float64"), ("orig_cltv", "float64"),
        ("num_borrowers", "float64"), ("dti", "float64"),
        ("borrower_credit_score", "float64"),
        ("first_home_buyer", "category"),
        ("loan_purpose", "category"), ("property_type", "category"),
        ("num_units", "int64"), ("occupancy_status", "category"),
        ("property_state", "category"), ("zip", "int64"),
        ("mortgage_insurance_percent", "float64"),
        ("product_type", "category"),
        ("coborrow_credit_score", "float64"),
        ("mortgage_insurance_type", "float64"),
        ("relocation_mortgage_indicator", "category"),
    ])

    print(acquisition_path)

    acq_table = pyblazing.create_table(
        table_name="acq",
        type=get_type_schema(acquisition_path),
        path=acquisition_path,
        delimiter="|",
        names=acq_cols,
        dtypes=acq_dtypes,  # TODO: dtypes=get_dtype_values(acq_dtypes)
        skip_rows=1,
    )
    Chronometer.show(clock, "Read Acquisition CSV")
    return acq_table
Example #19
0
def gpu_load_performance_csv(performance_path, **kwargs):
    """ Loads performance data

    Returns
    -------
    GPU DataFrame
    """
    clock = Chronometer.makeStarted()

    perf_cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate",
        "current_actual_upb", "loan_age",
        "remaining_months_to_legal_maturity",
        "adj_remaining_months_to_maturity", "maturity_date", "msa",
        "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date",
        "foreclosed_after", "disposition_date", "foreclosure_costs",
        "prop_preservation_and_repair_costs", "asset_recovery_costs",
        "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds",
        "other_foreclosure_proceeds", "non_interest_bearing_upb",
        "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount",
        "servicing_activity_indicator",
    ]

    perf_dtypes = OrderedDict([
        ("loan_id", "int64"), ("monthly_reporting_period", "date"),
        ("servicer", "category"), ("interest_rate", "float64"),
        ("current_actual_upb", "float64"), ("loan_age", "float64"),
        ("remaining_months_to_legal_maturity", "float64"),
        ("adj_remaining_months_to_maturity", "float64"),
        ("maturity_date", "date"), ("msa", "float64"),
        ("current_loan_delinquency_status", "int32"),
        ("mod_flag", "category"), ("zero_balance_code", "category"),
        ("zero_balance_effective_date", "date"),
        ("last_paid_installment_date", "date"),
        ("foreclosed_after", "date"), ("disposition_date", "date"),
        ("foreclosure_costs", "float64"),
        ("prop_preservation_and_repair_costs", "float64"),
        ("asset_recovery_costs", "float64"),
        ("misc_holding_expenses", "float64"),
        ("holding_taxes", "float64"), ("net_sale_proceeds", "float64"),
        ("credit_enhancement_proceeds", "float64"),
        ("repurchase_make_whole_proceeds", "float64"),
        ("other_foreclosure_proceeds", "float64"),
        ("non_interest_bearing_upb", "float64"),
        ("principal_forgiveness_upb", "float64"),
        ("repurchase_make_whole_proceeds_flag", "category"),
        ("foreclosure_principal_write_off_amount", "float64"),
        ("servicing_activity_indicator", "category"),
    ])

    print(performance_path)

    perf_table = pyblazing.create_table(
        table_name="perf",
        type=get_type_schema(performance_path),
        path=performance_path,
        delimiter="|",
        names=perf_cols,
        dtypes=perf_dtypes,  # TODO: dtypes=get_dtype_values(perf_dtypes)
        skip_rows=1,
    )
    Chronometer.show(clock, "Read Performance CSV")
    return perf_table
Example #20
0
            all_xgb_convert_times.append(
                xgb_convert_end_time - xgb_convert_start_time)

# Hold out 20% of the data for testing; fixed seed for reproducibility.
data_train, data_test, label_train, label_test = train_test_split(
    final_cpu_df_data, final_cpu_df_label, test_size=0.20, random_state=42
)

# Wrap the splits in XGBoost's native DMatrix format.
xgdf_train = xgb.DMatrix(data_train, label_train)
xgdf_test = xgb.DMatrix(data_test, label_test)

# Time training both with the project Chronometer and wall-clock time.time()
# (startTime is read again further down the script — TODO confirm).
chronometerTrain1 = Chronometer.makeStarted()
startTime = time.time()
bst = xgb.train(dxgb_gpu_params, xgdf_train,
                num_boost_round=dxgb_gpu_params["nround"])
Chronometer.show(chronometerTrain1, "Train 1")

chronometerPredict1 = Chronometer.makeStarted()
preds = bst.predict(xgdf_test)
Chronometer.show(chronometerPredict1, "Predict 1")

# Classification error at a 0.5 probability threshold.
labels = xgdf_test.get_label()
print(
    "prediction error=%f"
    % (
        sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
        / float(len(preds))
    )
)

endTime = time.time()
Example #21
0
            all_xgb_convert_times.append(xgb_convert_end_time -
                                         xgb_convert_start_time)

# Hold out 20% of the data for testing; fixed seed for reproducibility.
data_train, data_test, label_train, label_test = train_test_split(
    final_cpu_df_data, final_cpu_df_label, test_size=0.20, random_state=42)

# Wrap the splits in XGBoost's native DMatrix format.
xgdf_train = xgb.DMatrix(data_train, label_train)
xgdf_test = xgb.DMatrix(data_test, label_test)

# Time training both with the project Chronometer and wall-clock time.time().
chronometerTrain1 = Chronometer.makeStarted()
startTime = time.time()
bst = xgb.train(dxgb_gpu_params,
                xgdf_train,
                num_boost_round=dxgb_gpu_params['nround'])
Chronometer.show(chronometerTrain1, 'Train 1')

chronometerPredict1 = Chronometer.makeStarted()
preds = bst.predict(xgdf_test)
Chronometer.show(chronometerPredict1, 'Predict 1')

# Classification error at a 0.5 probability threshold.
labels = xgdf_test.get_label()
print('prediction error=%f' %
      (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) /
       float(len(preds))))

# Combined train+predict wall time; startTime was set just before training.
endTime = time.time()
trainPredict_time = (endTime - startTime)

# Summary of accumulated timings (all_load_times is built earlier in the
# script — presumably per-file CSV load durations; confirm upstream).
print("TIMES SUMMARY")
print('LOAD Time: %fs' % sum(all_load_times))