Example #1
def fit_on_spark(self,
                 train_df: DF,
                 evaluate_df: OPTIONAL_DF = None,
                 fs_directory: Optional[str] = None,
                 compression: Optional[str] = None):
    super().fit_on_spark(train_df, evaluate_df)
    # Validate the inputs and convert them to Spark DataFrames if needed
    train_df = self._check_and_convert(train_df)
    if evaluate_df is not None:
        evaluate_df = self._check_and_convert(evaluate_df)
    # Turn each DataFrame into a sharded Ray MLDataset, optionally staging
    # the data under fs_directory with the given compression codec
    train_ds = RayMLDataset.from_spark(
        train_df, self._num_workers, self._batch_size, fs_directory,
        compression)
    evaluate_ds = None
    if evaluate_df is not None:
        evaluate_ds = RayMLDataset.from_spark(
            evaluate_df, self._num_workers, self._batch_size, fs_directory,
            compression)
    # Delegate the actual training to fit() on the converted datasets
    return self.fit(train_ds, evaluate_ds)
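The example shows only the method body, not a call site. A hedged sketch of how it might be invoked follows; estimator, train_df and eval_df are hypothetical stand-ins for a RayDP estimator instance and pyspark DataFrames, and the staging values are illustrative:

# Hypothetical call site for the method above (a sketch, not from the example)
model = estimator.fit_on_spark(
    train_df,
    evaluate_df=eval_df,
    fs_directory="/tmp/raydp-stage",  # assumed staging directory
    compression="gzip")               # assumed compression codec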
Example #2
def fit_on_spark(self,
                 train_df: DF,
                 evaluate_df: OPTIONAL_DF = None,
                 fs_directory: Optional[str] = None,
                 compression: Optional[str] = None,
                 num_steps=None,
                 profile=False,
                 reduce_results=True,
                 max_retries=3,
                 info=None):
    super().fit_on_spark(train_df, evaluate_df)
    # Validate the inputs and convert them to Spark DataFrames if needed
    train_df = self._check_and_convert(train_df)
    if evaluate_df is not None:
        evaluate_df = self._check_and_convert(evaluate_df)
    # Build sharded Ray MLDatasets; this variant also passes the
    # estimator's shuffle flag through to the conversion
    train_ds = RayMLDataset.from_spark(
        train_df, self._num_workers, self._shuffle, None, fs_directory,
        compression)
    evaluate_ds = None
    if evaluate_df is not None:
        evaluate_ds = RayMLDataset.from_spark(
            evaluate_df, self._num_workers, self._shuffle, None, fs_directory,
            compression)
    # Forward the remaining training options (step cap, profiling, result
    # reduction, retry budget, user info) to fit()
    return self.fit(
        train_ds, evaluate_ds, num_steps, profile, reduce_results,
        max_retries, info)
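Relative to Example #1, this variant adds pass-through training options. A sketch of supplying them, reusing the hypothetical estimator and DataFrames from the sketch above (the values are illustrative):

# Illustrative values for the pass-through options
model = estimator.fit_on_spark(
    train_df,
    evaluate_df=eval_df,
    num_steps=200,        # cap the number of training batches
    profile=True,         # collect per-step timing information
    reduce_results=True,  # average worker results instead of listing them
    max_retries=3)        # retry budget for failed workers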
Example #3
def process_data():
    # NYC_TRAIN_CSV, nyc_taxi_preprocess and the parsed args object are
    # defined elsewhere in the original script
    app_name = "NYC Taxi Fare Prediction with RayDP"
    num_executors = 1
    cores_per_executor = 1
    memory_per_executor = "500M"
    # Use RayDP to create a Spark session running on the Ray cluster
    spark = raydp.init_spark(app_name, num_executors, cores_per_executor,
                             memory_per_executor)
    data = spark.read.format("csv").option("header", "true") \
        .option("inferSchema", "true") \
        .load(NYC_TRAIN_CSV)
    # Set the Spark session timezone for datetime processing
    spark.conf.set("spark.sql.session.timeZone", "UTC")
    data = nyc_taxi_preprocess(data)
    # Convert the processed DataFrame into a single-shard Ray MLDataset
    ds = RayMLDataset.from_spark(data, 1, args.batch_size)
    # Every column except the label is used as a feature
    features = [
        field.name for field in list(data.schema)
        if field.name != "fare_amount"
    ]
    # Return a torch-compatible dataset together with the feature count
    return ds.to_torch(feature_columns=features,
                       label_column="fare_amount"), len(features)
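The return value pairs a torch-compatible dataset with the feature count, which is exactly what a model constructor needs. Hypothetical driver code (not part of the example):

# Hypothetical driver for Example #3: the feature count sizes the first
# layer of a model such as NYC_Model from Example #5
torch_ds, num_features = process_data()
model = NYC_Model(num_features)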
Example #4
# raydp, random_split, nyc_taxi_preprocess and NYC_TRAIN_CSV come from the
# surrounding script; RayDMatrix, RayParams and train are from xgboost_ray
app_name = "NYC Taxi Fare Prediction with RayDP"
num_executors = 1
cores_per_executor = 1
memory_per_executor = "500M"
spark = raydp.init_spark(app_name, num_executors, cores_per_executor,
                         memory_per_executor)
data = spark.read.format("csv").option("header", "true") \
        .option("inferSchema", "true") \
        .load(NYC_TRAIN_CSV)
# Set the Spark session timezone for datetime processing
spark.conf.set("spark.sql.session.timeZone", "UTC")
# Transform the dataset
data = nyc_taxi_preprocess(data)
# Split the data into a train dataset and a test dataset
train_df, test_df = random_split(data, [0.9, 0.1], 0)
# Convert the Spark DataFrames into ML Datasets (2 shards, batch size 32)
train_dataset = RayMLDataset.from_spark(train_df, 2, 32)
test_dataset = RayMLDataset.from_spark(test_df, 2, 32)
# Then convert them into the DMatrix format used by xgboost
dtrain = RayDMatrix(train_dataset, label='fare_amount')
dtest = RayDMatrix(test_dataset, label='fare_amount')
# Configure XGBoost for regression on the fare amount
config = {
    "tree_method": "hist",
    "objective": "reg:squarederror",
    "eval_metric": ["rmse"],
}
evals_result = {}
# Train the model (the excerpt cuts off mid-call; the trailing arguments
# below are assumed values that complete it)
bst = train(config,
            dtrain,
            evals=[(dtest, "eval")],
            evals_result=evals_result,
            ray_params=RayParams(num_actors=2),
            num_boost_round=10)
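The trained booster can then be evaluated the same distributed way. A sketch using xgboost_ray's predict; the RayParams value mirrors the assumed training call above:

# Sketch: distributed prediction on the held-out RayDMatrix
from xgboost_ray import predict

preds = predict(bst, dtest, ray_params=RayParams(num_actors=2))
print("final eval RMSE:", evals_result["eval"]["rmse"][-1])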
Example #5
# spark, num_executors, nn and the helpers used below are set up earlier
# in the original script (as in Example #4)
# Here we just use a subset of the training data
data = spark.read.format("csv").option("header", "true") \
        .option("inferSchema", "true") \
        .load(NYC_TRAIN_CSV)
# Set spark timezone for processing datetime
spark.conf.set("spark.sql.session.timeZone", "UTC")
# Transform the dataset
data = nyc_taxi_preprocess(data)
# Split data into train_dataset and test_dataset
train_df, test_df = random_split(data, [0.9, 0.1], 0)
features = [
    field.name for field in list(train_df.schema)
    if field.name != "fare_amount"
]
# Convert spark dataframe into ML Dataset
train_dataset = RayMLDataset.from_spark(train_df, num_executors, 32)
test_dataset = RayMLDataset.from_spark(test_df, num_executors, 32)
# Then convert to torch datasets
train_dataset = train_dataset.to_torch(feature_columns=features,
                                       label_column="fare_amount")
test_dataset = test_dataset.to_torch(feature_columns=features,
                                     label_column="fare_amount")


# Define a neural network model
class NYC_Model(nn.Module):
    def __init__(self, cols):
        super(NYC_Model, self).__init__()

        self.fc1 = nn.Linear(cols, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)  # assumed head; the excerpt truncates here

    def forward(self, x):  # assumed minimal forward pass for the layers above
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
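A quick shape check of the completed sketch (hypothetical driver code; assumes torch and nn are imported as in the original script):

import torch

model = NYC_Model(cols=10)   # 10 is an assumed feature count
batch = torch.randn(32, 10)  # synthetic batch of 32 rows
print(model(batch).shape)    # torch.Size([32, 1])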