def import_taxi_records(dh_session: Session) -> Table:
    """Download the NYC yellow-taxi trip CSV, load it into a pyarrow table,
    strip columns Deephaven cannot ingest, and upload it to the server.

    Args:
        dh_session: an active Deephaven client session used for the upload.

    Returns:
        Table: the server-side Deephaven table created from the pyarrow data.
    """
    # Download the CSV data and read it into a pyarrow table.
    # NOTE(review): the nyc-tlc S3 bucket has historically moved/renamed its
    # trip-data files — confirm this URL is still live.
    csv_file_name = download_csv(
        url="https://nyc-tlc.s3.amazonaws.com/trip+data/yellow_tripdata_2020-12.csv"
    )
    pa_table = csv.read_csv(csv_file_name)

    # Columns we never want to upload, regardless of type.
    unwanted_columns = [
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime",
        "RatecodeID",
        "store_and_fwd_flag",
        "PULocationID",
        "DOLocationID",
    ]
    pa_table = pa_table.drop(unwanted_columns)

    # Collect every column with an unsupported data type, then drop them in a
    # single call — pa_table.drop() returns a new table each time, so dropping
    # one column per iteration would rebuild the table once per bad column.
    incompatible = []
    for column, column_name in zip(pa_table.columns, pa_table.column_names):
        if not is_deephaven_compatible(column.type):
            print(
                f"drop column: {column_name} because of unsupported data type {column.type}"
            )
            incompatible.append(column_name)
    if incompatible:
        pa_table = pa_table.drop(incompatible)

    # Upload the pyarrow table to the Deephaven server.
    return dh_session.import_table(pa_table)
def test_merge_tables(self):
    """Merging a table with two derived tables must yield more rows than any input."""
    session = Session()
    arrow_data = csv.read_csv(self.csv_file)

    # Base table plus two derived views: a group/ungroup round-trip and a filter.
    base = session.import_table(arrow_data)
    regrouped = base.group_by(by=["a", "c"]).ungroup(cols=["b", "d", "e"])
    filtered = base.where(["a % 2 > 0 && b % 3 == 1"])

    merged = session.merge_tables(tables=[base, regrouped, filtered], order_by="a")

    # The merge concatenates all three inputs, so it must be strictly larger
    # than each one individually.
    for source in (base, regrouped, filtered):
        self.assertTrue(merged.size > source.size)