Exemple #1
0
def test_csv_reader_skiprows_skipfooter(tmpdir):
    """Check that ``read_csv`` honours ``skiprows`` and ``skipfooter``.

    The 'Integer', 'Date' and 'Float' columns of the mixed dataframe are
    written to a header-less CSV file.  The file is then read back with
    pandas and with ``read_csv``, skipping one leading and one trailing
    row in both cases, and the two results are compared.

    Parameters
    ----------
    tmpdir
        Temporary directory object; only ``mkdir(...).join(...)`` is used.
    """
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file5.csv")

    df = make_numpy_mixed_dataframe()
    df.to_csv(fname,
              columns=['Integer', 'Date', 'Float'],
              index=False,
              header=False)

    names = ['1', '2', '3']

    # Using engine='python' to eliminate pandas warning of using python engine.
    df_out = pd.read_csv(fname,
                         names=names,
                         parse_dates=[1],
                         dayfirst=True,
                         skiprows=1,
                         skipfooter=1,
                         engine='python')
    out = read_csv(str(fname),
                   names=names,
                   dtype=['int64', 'date', 'float64'],
                   skiprows=1,
                   skipfooter=1,
                   dayfirst=True)

    assert len(out.columns) == len(df_out.columns)
    assert len(out) == len(df_out)
    # pd.testing is the public home of assert_frame_equal; the older
    # pd.util.testing alias is deprecated and absent from recent pandas.
    pd.testing.assert_frame_equal(df_out, out.to_pandas())
def gpu_load_performance_csv(performance_path, **kwargs):
    """ Loads performance data

    The file is read with ``pygdf.read_csv`` using ``'|'`` as delimiter,
    the column names and dtypes declared below, and with the first row
    skipped.  The path is printed before reading.

    Parameters
    ----------
    performance_path
        Path of the performance file, handed unchanged to
        ``pygdf.read_csv``.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    GPU DataFrame
    """

    # One (column name, dtype) pair per column, in file order.
    schema = [
        ("loan_id", "int64"),
        ("monthly_reporting_period", "date"),
        ("servicer", "category"),
        ("interest_rate", "float64"),
        ("current_actual_upb", "float64"),
        ("loan_age", "float64"),
        ("remaining_months_to_legal_maturity", "float64"),
        ("adj_remaining_months_to_maturity", "float64"),
        ("maturity_date", "date"),
        ("msa", "float64"),
        ("current_loan_delinquency_status", "int32"),
        ("mod_flag", "category"),
        ("zero_balance_code", "category"),
        ("zero_balance_effective_date", "date"),
        ("last_paid_installment_date", "date"),
        ("foreclosed_after", "date"),
        ("disposition_date", "date"),
        ("foreclosure_costs", "float64"),
        ("prop_preservation_and_repair_costs", "float64"),
        ("asset_recovery_costs", "float64"),
        ("misc_holding_expenses", "float64"),
        ("holding_taxes", "float64"),
        ("net_sale_proceeds", "float64"),
        ("credit_enhancement_proceeds", "float64"),
        ("repurchase_make_whole_proceeds", "float64"),
        ("other_foreclosure_proceeds", "float64"),
        ("non_interest_bearing_upb", "float64"),
        ("principal_forgiveness_upb", "float64"),
        ("repurchase_make_whole_proceeds_flag", "category"),
        ("foreclosure_principal_write_off_amount", "float64"),
        ("servicing_activity_indicator", "category"),
    ]

    # Split the pairs into the two parallel lists read_csv expects.
    column_names = [name for name, _ in schema]
    column_types = [kind for _, kind in schema]

    print(performance_path)

    gdf = pygdf.read_csv(
        performance_path,
        names=column_names,
        delimiter='|',
        dtype=column_types,
        skiprows=1,
    )
    return gdf
Exemple #3
0
def test_csv_reader_numeric_data(dtype, nelem, tmpdir):
    """Round-trip a numeric dataframe through CSV and ``read_csv``.

    A dataframe built by ``make_numeric_dataframe(nelem, dtype)`` is
    written to a header-less CSV file and read back with ``read_csv``
    using the dataframe's own column names and dtypes.  The result must
    equal the original dataframe.

    Parameters
    ----------
    dtype
        Forwarded to ``make_numeric_dataframe``.
    nelem
        Forwarded to ``make_numeric_dataframe``.
    tmpdir
        Temporary directory object; only ``mkdir(...).join(...)`` is used.
    """
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file1.csv")

    df = make_numeric_dataframe(nelem, dtype)
    df.to_csv(fname, index=False, header=False)

    dtypes = [df[k].dtype for k in df.columns]
    out = read_csv(str(fname), names=list(df.columns.values), dtype=dtypes)

    assert len(out.columns) == len(df.columns)
    # pd.testing is the public home of assert_frame_equal; the older
    # pd.util.testing alias is deprecated and absent from recent pandas.
    pd.testing.assert_frame_equal(df, out.to_pandas())
def gpu_load_acquisition_csv(acquisition_path, **kwargs):
    """ Loads acquisition data

    The file is read with ``pygdf.read_csv`` using ``'|'`` as delimiter,
    the column names and dtypes declared below, and with the first row
    skipped.  The path is printed before reading.

    Parameters
    ----------
    acquisition_path
        Path of the acquisition file, handed unchanged to
        ``pygdf.read_csv``.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    GPU DataFrame
    """

    # One (column name, dtype) pair per column, in file order.
    schema = [
        ('loan_id', "int64"),
        ('orig_channel', "category"),
        ('seller_name', "category"),
        ('orig_interest_rate', "float64"),
        ('orig_upb', "int64"),
        ('orig_loan_term', "int64"),
        ('orig_date', "date"),
        ('first_pay_date', "date"),
        ('orig_ltv', "float64"),
        ('orig_cltv', "float64"),
        ('num_borrowers', "float64"),
        ('dti', "float64"),
        ('borrower_credit_score', "float64"),
        ('first_home_buyer', "category"),
        ('loan_purpose', "category"),
        ('property_type', "category"),
        ('num_units', "int64"),
        ('occupancy_status', "category"),
        ('property_state', "category"),
        ('zip', "int64"),
        ('mortgage_insurance_percent', "float64"),
        ('product_type', "category"),
        ('coborrow_credit_score', "float64"),
        ('mortgage_insurance_type', "float64"),
        ('relocation_mortgage_indicator', "category"),
    ]

    # Split the pairs into the two parallel lists read_csv expects.
    column_names = [name for name, _ in schema]
    column_types = [kind for _, kind in schema]

    print(acquisition_path)

    gdf = pygdf.read_csv(
        acquisition_path,
        names=column_names,
        delimiter='|',
        dtype=column_types,
        skiprows=1,
    )
    return gdf
Exemple #5
0
def test_csv_reader_mixed_data_delimiter(tmpdir):
    """Read a ``'|'``-delimited mixed-type file with both readers.

    The mixed dataframe is written without index or header using ``'|'``
    as separator, then loaded through ``read_csv`` and ``pd.read_csv``.
    Only the number of resulting columns is compared.
    """
    csv_path = tmpdir.mkdir("gdf_csv").join('tmp_csvreader_file3.csv')

    source = make_numpy_mixed_dataframe()
    source.to_csv(csv_path, sep='|', index=False, header=False)

    col_names = ['1', '2', '3', '4']
    separator = '|'

    gdf_result = read_csv(
        str(csv_path),
        delimiter=separator,
        names=col_names,
        dtype=['int64', 'date', 'float64', 'category'],
        dayfirst=True,
    )
    pandas_result = pd.read_csv(
        csv_path,
        delimiter=separator,
        names=col_names,
        parse_dates=[1],
        dayfirst=True,
    )

    assert len(gdf_result.columns) == len(pandas_result.columns)
Exemple #6
0
def test_csv_reader_datetime_data(tmpdir):
    """Compare ``read_csv`` with pandas on a two-column date file.

    The dataframe from ``make_datetime_dataframe()`` is written to a
    header-less CSV file.  pandas parses both columns as day-first dates;
    ``read_csv`` is given dtype ``'date'`` for both columns and the same
    ``dayfirst`` setting.  The two frames must be equal.

    Parameters
    ----------
    tmpdir
        Temporary directory object; only ``mkdir(...).join(...)`` is used.
    """
    fname = tmpdir.mkdir("gdf_csv").join('tmp_csvreader_file2.csv')

    df = make_datetime_dataframe()
    df.to_csv(fname, index=False, header=False)

    # NOTE(review): pandas gets literal names while read_csv gets the
    # dataframe's own column names; the frame comparison below only holds
    # if those are 'col1' and 'col2' -- confirm against the helper.
    df_out = pd.read_csv(fname,
                         names=['col1', 'col2'],
                         parse_dates=[0, 1],
                         dayfirst=True)
    dtypes = ['date', 'date']
    out = read_csv(str(fname),
                   names=list(df.columns.values),
                   dtype=dtypes,
                   dayfirst=True)

    assert len(out.columns) == len(df_out.columns)
    # pd.testing is the public home of assert_frame_equal; the older
    # pd.util.testing alias is deprecated and absent from recent pandas.
    pd.testing.assert_frame_equal(df_out, out.to_pandas())
Exemple #7
0
def test_csv_reader_all_numeric_dtypes(tmpdir):
    """Compare ``read_csv`` with pandas using per-column dtype mappings.

    ``make_all_numeric_dtypes_dataframe()`` supplies a dataframe plus two
    mappings: one passed as ``dtype`` to ``read_csv`` and one passed as
    ``dtype`` to ``pd.read_csv``.  Each mapping's keys also serve as the
    column names for its reader.  The two resulting frames must be equal.

    Parameters
    ----------
    tmpdir
        Temporary directory object; only ``mkdir(...).join(...)`` is used.
    """
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file4.csv")

    df, gdf_dict, pd_dict = make_all_numeric_dtypes_dataframe()
    df.to_csv(fname, sep=',', index=False, header=False)

    out = read_csv(str(fname),
                   delimiter=',',
                   names=list(gdf_dict.keys()),
                   dtype=gdf_dict)
    df_out = pd.read_csv(fname,
                         delimiter=',',
                         names=list(pd_dict.keys()),
                         dtype=pd_dict,
                         dayfirst=True)

    assert len(out.columns) == len(df_out.columns)
    # pd.testing is the public home of assert_frame_equal; the older
    # pd.util.testing alias is deprecated and absent from recent pandas.
    pd.testing.assert_frame_equal(df_out, out.to_pandas())
def gpu_load_names(**kwargs):
    """ Loads names used for renaming the banks

    Reads the fixed file ``/rapids/data/mortgage/names.csv`` with
    ``pygdf.read_csv`` using ``'|'`` as delimiter, two category columns,
    and with the first row skipped.

    Parameters
    ----------
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    GPU DataFrame
    """

    # One (column name, dtype) pair per column, in file order.
    schema = [
        ('seller_name', "category"),
        ('new', "category"),
    ]

    # Split the pairs into the two parallel lists read_csv expects.
    column_names = [name for name, _ in schema]
    column_types = [kind for _, kind in schema]

    gdf = pygdf.read_csv(
        "/rapids/data/mortgage/names.csv",
        names=column_names,
        delimiter='|',
        dtype=column_types,
        skiprows=1,
    )
    return gdf