def fit_on_spark(self,
                 train_df: DF,
                 evaluate_df: OPTIONAL_DF = None,
                 fs_directory: Optional[str] = None,
                 compression: Optional[str] = None):
    super().fit_on_spark(train_df, evaluate_df)
    train_df = self._check_and_convert(train_df)
    if evaluate_df is not None:
        evaluate_df = self._check_and_convert(evaluate_df)
    # Convert the Spark DataFrames into Ray MLDatasets, then delegate to fit()
    train_ds = RayMLDataset.from_spark(
        train_df, self._num_workers, self._batch_size, fs_directory, compression)
    evaluate_ds = None
    if evaluate_df is not None:
        evaluate_ds = RayMLDataset.from_spark(
            evaluate_df, self._num_workers, self._batch_size, fs_directory, compression)
    return self.fit(train_ds, evaluate_ds)
def fit_on_spark(self,
                 train_df: DF,
                 evaluate_df: OPTIONAL_DF = None,
                 fs_directory: Optional[str] = None,
                 compression: Optional[str] = None,
                 num_steps=None,
                 profile=False,
                 reduce_results=True,
                 max_retries=3,
                 info=None):
    super().fit_on_spark(train_df, evaluate_df)
    train_df = self._check_and_convert(train_df)
    if evaluate_df is not None:
        evaluate_df = self._check_and_convert(evaluate_df)
    train_ds = RayMLDataset.from_spark(
        train_df, self._num_workers, self._shuffle, None, fs_directory, compression)
    evaluate_ds = None
    if evaluate_df is not None:
        evaluate_ds = RayMLDataset.from_spark(
            evaluate_df, self._num_workers, self._shuffle, None, fs_directory, compression)
    return self.fit(
        train_ds, evaluate_ds, num_steps, profile, reduce_results, max_retries, info)
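# A minimal sketch of invoking fit_on_spark() from user code, based on the
# TorchEstimator example in the RayDP README. The constructor parameters
# (model, optimizer, loss, feature_columns, label_column, batch_size,
# num_epochs) and the toy model should be treated as illustrative
# assumptions, not the definitive API.
import torch
from raydp.torch import TorchEstimator

model = torch.nn.Linear(10, 1)                    # toy model for illustration
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.MSELoss()

estimator = TorchEstimator(num_workers=2,
                           model=model,
                           optimizer=optimizer,
                           loss=criterion,
                           feature_columns=features,  # computed as in the preprocessing below
                           label_column="fare_amount",
                           batch_size=64,
                           num_epochs=30)
# Spark DataFrames go in directly; the conversion to MLDatasets happens inside
estimator.fit_on_spark(train_df, test_df)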
def process_data():
    app_name = "NYC Taxi Fare Prediction with RayDP"
    num_executors = 1
    cores_per_executor = 1
    memory_per_executor = "500M"
    # Use RayDP to perform data processing
    spark = raydp.init_spark(app_name, num_executors, cores_per_executor,
                             memory_per_executor)
    data = spark.read.format("csv").option("header", "true") \
        .option("inferSchema", "true") \
        .load(NYC_TRAIN_CSV)
    # Set the Spark timezone for processing datetime columns
    spark.conf.set("spark.sql.session.timeZone", "UTC")
    data = nyc_taxi_preprocess(data)
    ds = RayMLDataset.from_spark(data, 1, args.batch_size)
    features = [field.name for field in list(data.schema)
                if field.name != "fare_amount"]
    return ds.to_torch(feature_columns=features,
                       label_column="fare_amount"), len(features)
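# A sketch of consuming what process_data() returns. to_torch() yields a
# torch-compatible MLDataset; get_shard() is assumed here from Ray's
# MLDataset API and hands each training worker its own shard as an
# iterable dataset.
from torch.utils.data import DataLoader

torch_ds, num_features = process_data()
shard = torch_ds.get_shard(0)            # shard index == worker rank
loader = DataLoader(shard, batch_size=32)
for x, y in loader:
    pass  # the forward/backward pass would go here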
num_executors = 1
cores_per_executor = 1
memory_per_executor = "500M"
spark = raydp.init_spark(app_name, num_executors, cores_per_executor,
                         memory_per_executor)
data = spark.read.format("csv").option("header", "true") \
    .option("inferSchema", "true") \
    .load(NYC_TRAIN_CSV)
# Set the Spark timezone for processing datetime columns
spark.conf.set("spark.sql.session.timeZone", "UTC")
# Transform the dataset
data = nyc_taxi_preprocess(data)
# Split the data into a train dataset and a test dataset
train_df, test_df = random_split(data, [0.9, 0.1], 0)
# Convert the Spark DataFrames into MLDatasets
train_dataset = RayMLDataset.from_spark(train_df, 2, 32)
test_dataset = RayMLDataset.from_spark(test_df, 2, 32)
# Then convert them into the DMatrix format used by XGBoost
dtrain = RayDMatrix(train_dataset, label='fare_amount')
dtest = RayDMatrix(test_dataset, label='fare_amount')
# Configure the XGBoost model; fare_amount is a continuous target, so use a
# regression metric rather than the classification metrics logloss/error
config = {
    "tree_method": "hist",
    "eval_metric": ["rmse"],
}
evals_result = {}
# Train the model
bst = train(config,
            dtrain,
            evals=[(dtest, "eval")],
            evals_result=evals_result,
            num_boost_round=10)  # assumed round count; the call is truncated in the source
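# Once training finishes, xgboost_ray's predict() can score a RayDMatrix with
# the trained booster. RayParams(num_actors=2) is an assumed setting chosen to
# match the two shards created above.
from xgboost_ray import predict, RayParams

preds = predict(bst, dtest, ray_params=RayParams(num_actors=2))
print("final eval rmse:", evals_result["eval"]["rmse"][-1])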
# Here we just use a subset of the training data
data = spark.read.format("csv").option("header", "true") \
    .option("inferSchema", "true") \
    .load(NYC_TRAIN_CSV)
# Set the Spark timezone for processing datetime columns
spark.conf.set("spark.sql.session.timeZone", "UTC")
# Transform the dataset
data = nyc_taxi_preprocess(data)
# Split the data into a train dataset and a test dataset
train_df, test_df = random_split(data, [0.9, 0.1], 0)
features = [field.name for field in list(train_df.schema)
            if field.name != "fare_amount"]
# Convert the Spark DataFrames into MLDatasets
train_dataset = RayMLDataset.from_spark(train_df, num_executors, 32)
test_dataset = RayMLDataset.from_spark(test_df, num_executors, 32)
# Then convert them to torch datasets
train_dataset = train_dataset.to_torch(feature_columns=features,
                                       label_column="fare_amount")
test_dataset = test_dataset.to_torch(feature_columns=features,
                                     label_column="fare_amount")

# Define a neural network model
class NYC_Model(nn.Module):
    def __init__(self, cols):
        super(NYC_Model, self).__init__()
        self.fc1 = nn.Linear(cols, 256)
        self.fc2 = nn.Linear(256, 128)
        # Assumed minimal head and forward pass: a single regression
        # output for fare_amount
        self.out = nn.Linear(128, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.out(x)
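# A sketch of a local training step wiring NYC_Model to the converted
# datasets; get_shard() and the (features, label) batch layout are assumed
# from the MLDataset torch integration rather than taken from the source.
import torch
from torch.utils.data import DataLoader

nyc_model = NYC_Model(len(features))
optimizer = torch.optim.Adam(nyc_model.parameters(), lr=1e-3)
criterion = nn.SmoothL1Loss()

loader = DataLoader(train_dataset.get_shard(0), batch_size=32)
for x, y in loader:
    optimizer.zero_grad()
    loss = criterion(nyc_model(x).squeeze(-1), y)
    loss.backward()
    optimizer.step()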