def test_csv_reader_skiprows_skipfooter(tmpdir):
    """Compare ``read_csv`` with pandas when leading/trailing rows are cut.

    Three columns of the mixed dataframe are written to a headerless CSV.
    The file is then read back by pandas and by ``read_csv``, both with
    ``skiprows=1`` and ``skipfooter=1``, and the results must agree in
    column count, row count and content.
    """
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file5.csv")

    df = make_numpy_mixed_dataframe()
    df.to_csv(fname, columns=['Integer', 'Date', 'Float'],
              index=False, header=False)

    # Using engine='python' to eliminate pandas warning of using python
    # engine.
    expected = pd.read_csv(fname, names=['1', '2', '3'], parse_dates=[1],
                           dayfirst=True, skiprows=1, skipfooter=1,
                           engine='python')

    out = read_csv(str(fname), names=['1', '2', '3'],
                   dtype=['int64', 'date', 'float64'],
                   skiprows=1, skipfooter=1, dayfirst=True)

    assert len(out.columns) == len(expected.columns)
    assert len(out) == len(expected)
    # pd.testing is the public home of assert_frame_equal; the older
    # pd.util.testing path is deprecated and absent from pandas 2.x.
    pd.testing.assert_frame_equal(expected, out.to_pandas())
def gpu_load_performance_csv(performance_path, **kwargs):
    """ Loads performance data

    Reads the '|'-delimited file at ``performance_path`` through
    ``pygdf.read_csv`` with ``skiprows=1``, using the fixed column names
    and dtypes listed in the body. The path is printed before reading.
    Extra keyword arguments are accepted but not used.

    Returns
    -------
    GPU DataFrame
    """
    # (column name, dtype) pairs, in the order handed to read_csv.
    schema = [
        ("loan_id", "int64"),
        ("monthly_reporting_period", "date"),
        ("servicer", "category"),
        ("interest_rate", "float64"),
        ("current_actual_upb", "float64"),
        ("loan_age", "float64"),
        ("remaining_months_to_legal_maturity", "float64"),
        ("adj_remaining_months_to_maturity", "float64"),
        ("maturity_date", "date"),
        ("msa", "float64"),
        ("current_loan_delinquency_status", "int32"),
        ("mod_flag", "category"),
        ("zero_balance_code", "category"),
        ("zero_balance_effective_date", "date"),
        ("last_paid_installment_date", "date"),
        ("foreclosed_after", "date"),
        ("disposition_date", "date"),
        ("foreclosure_costs", "float64"),
        ("prop_preservation_and_repair_costs", "float64"),
        ("asset_recovery_costs", "float64"),
        ("misc_holding_expenses", "float64"),
        ("holding_taxes", "float64"),
        ("net_sale_proceeds", "float64"),
        ("credit_enhancement_proceeds", "float64"),
        ("repurchase_make_whole_proceeds", "float64"),
        ("other_foreclosure_proceeds", "float64"),
        ("non_interest_bearing_upb", "float64"),
        ("principal_forgiveness_upb", "float64"),
        ("repurchase_make_whole_proceeds_flag", "category"),
        ("foreclosure_principal_write_off_amount", "float64"),
        ("servicing_activity_indicator", "category"),
    ]
    column_names = [name for name, _ in schema]
    column_dtypes = [kind for _, kind in schema]

    print(performance_path)

    return pygdf.read_csv(
        performance_path,
        names=column_names,
        delimiter='|',
        dtype=column_dtypes,
        skiprows=1,
    )
def test_csv_reader_numeric_data(dtype, nelem, tmpdir):
    """Round-trip a numeric dataframe through CSV and ``read_csv``.

    ``dtype`` and ``nelem`` are forwarded to ``make_numeric_dataframe``
    (presumably supplied by pytest parametrization that is not visible
    here). The frame is written without header or index, read back with
    the original column names and dtypes, and compared with the source.
    """
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file1.csv")

    df = make_numeric_dataframe(nelem, dtype)
    df.to_csv(fname, index=False, header=False)

    column_dtypes = [df[col].dtype for col in df.columns]
    out = read_csv(str(fname), names=list(df.columns.values),
                   dtype=column_dtypes)

    assert len(out.columns) == len(df.columns)
    # pd.testing is the public home of assert_frame_equal; the older
    # pd.util.testing path is deprecated and absent from pandas 2.x.
    pd.testing.assert_frame_equal(df, out.to_pandas())
def gpu_load_acquisition_csv(acquisition_path, **kwargs):
    """ Loads acquisition data

    Reads the '|'-delimited file at ``acquisition_path`` through
    ``pygdf.read_csv`` with ``skiprows=1``, using the fixed column names
    and dtypes listed in the body. The path is printed before reading.
    Extra keyword arguments are accepted but not used.

    Returns
    -------
    GPU DataFrame
    """
    # (column name, dtype) pairs, in the order handed to read_csv.
    schema = [
        ("loan_id", "int64"),
        ("orig_channel", "category"),
        ("seller_name", "category"),
        ("orig_interest_rate", "float64"),
        ("orig_upb", "int64"),
        ("orig_loan_term", "int64"),
        ("orig_date", "date"),
        ("first_pay_date", "date"),
        ("orig_ltv", "float64"),
        ("orig_cltv", "float64"),
        ("num_borrowers", "float64"),
        ("dti", "float64"),
        ("borrower_credit_score", "float64"),
        ("first_home_buyer", "category"),
        ("loan_purpose", "category"),
        ("property_type", "category"),
        ("num_units", "int64"),
        ("occupancy_status", "category"),
        ("property_state", "category"),
        ("zip", "int64"),
        ("mortgage_insurance_percent", "float64"),
        ("product_type", "category"),
        ("coborrow_credit_score", "float64"),
        ("mortgage_insurance_type", "float64"),
        ("relocation_mortgage_indicator", "category"),
    ]
    column_names = [name for name, _ in schema]
    column_dtypes = [kind for _, kind in schema]

    print(acquisition_path)

    return pygdf.read_csv(
        acquisition_path,
        names=column_names,
        delimiter='|',
        dtype=column_dtypes,
        skiprows=1,
    )
def test_csv_reader_mixed_data_delimiter(tmpdir):
    """Read a '|'-separated mixed-type CSV with ``read_csv`` and pandas.

    Only the number of columns is compared between the two results.
    """
    target = tmpdir.mkdir("gdf_csv").join('tmp_csvreader_file3.csv')

    source_frame = make_numpy_mixed_dataframe()
    source_frame.to_csv(target, sep='|', index=False, header=False)

    gdf_result = read_csv(
        str(target),
        delimiter='|',
        names=['1', '2', '3', '4'],
        dtype=['int64', 'date', 'float64', 'category'],
        dayfirst=True,
    )
    pandas_result = pd.read_csv(
        target,
        delimiter='|',
        names=['1', '2', '3', '4'],
        parse_dates=[1],
        dayfirst=True,
    )

    assert len(gdf_result.columns) == len(pandas_result.columns)
def test_csv_reader_datetime_data(tmpdir):
    """Compare ``read_csv`` with pandas on a two-column datetime CSV.

    The datetime dataframe is written without header or index. pandas
    reads it with both columns parsed as day-first dates; ``read_csv``
    reads it with dtype ``'date'`` for both columns and ``dayfirst=True``.
    The two results must match.
    """
    fname = tmpdir.mkdir("gdf_csv").join('tmp_csvreader_file2.csv')

    df = make_datetime_dataframe()
    df.to_csv(fname, index=False, header=False)

    expected = pd.read_csv(fname, names=['col1', 'col2'],
                           parse_dates=[0, 1], dayfirst=True)

    out = read_csv(str(fname), names=list(df.columns.values),
                   dtype=['date', 'date'], dayfirst=True)

    assert len(out.columns) == len(expected.columns)
    # pd.testing is the public home of assert_frame_equal; the older
    # pd.util.testing path is deprecated and absent from pandas 2.x.
    pd.testing.assert_frame_equal(expected, out.to_pandas())
def test_csv_reader_all_numeric_dtypes(tmpdir):
    """Compare ``read_csv`` with pandas using per-column dtype mappings.

    ``make_all_numeric_dtypes_dataframe`` supplies the dataframe plus two
    name-to-dtype mappings: one passed to ``read_csv`` and one passed to
    pandas. Both readers load the same headerless CSV and the resulting
    frames must match.
    """
    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file4.csv")

    df, gdf_dict, pd_dict = make_all_numeric_dtypes_dataframe()
    df.to_csv(fname, sep=',', index=False, header=False)

    out = read_csv(str(fname), delimiter=',',
                   names=list(gdf_dict.keys()), dtype=gdf_dict)
    expected = pd.read_csv(fname, delimiter=',',
                           names=list(pd_dict.keys()), dtype=pd_dict,
                           dayfirst=True)

    assert len(out.columns) == len(expected.columns)
    # pd.testing is the public home of assert_frame_equal; the older
    # pd.util.testing path is deprecated and absent from pandas 2.x.
    pd.testing.assert_frame_equal(expected, out.to_pandas())
def gpu_load_names(**kwargs):
    """ Loads names used for renaming the banks

    Reads the '|'-delimited file at the fixed path
    ``/rapids/data/mortgage/names.csv`` through ``pygdf.read_csv`` with
    ``skiprows=1``; both columns are read as ``category``. Keyword
    arguments are accepted but not used.

    Returns
    -------
    GPU DataFrame
    """
    # (column name, dtype) pairs, in the order handed to read_csv.
    schema = [
        ("seller_name", "category"),
        ("new", "category"),
    ]
    column_names = [name for name, _ in schema]
    column_dtypes = [kind for _, kind in schema]

    return pygdf.read_csv(
        "/rapids/data/mortgage/names.csv",
        names=column_names,
        delimiter='|',
        dtype=column_dtypes,
        skiprows=1,
    )