def train(): try: print("starting training...") hyperparameters = load_json_object(hyperparameters_file_path) print("\nHyperparameters configuration:") print_json_object(hyperparameters) input_data_config = load_json_object(inputdataconfig_file_path) print("\nInput data configuration:") print_json_object(input_data_config) for key in input_data_config: print("\nList of files in {0} channel: ".format(key)) channel_path = data_files_path + key + "/" print_files_in_path(channel_path) if os.path.exists(resource_file_path): resource_config = load_json_object(resource_file_path) print("\nResource configuration:") print_json_object(resource_config) # Take the set of files and read them all into a single pandas dataframe input_files = [ os.path.join(data_files_path + "train/", file) for file in os.listdir(data_files_path + "train/") ] if len(input_files) == 0: raise ValueError(( "There are no files in {}.\n" + "This usually indicates that the channel ({}) was incorrectly specified,\n" + "the data specification in S3 was incorrectly specified or the role specified\n" + "does not have permission to access the data.").format( data_files_path + "train/", "train")) concat_data = load_raw(input_files, [label_column, feature_column]) print(concat_data.info()) preprocessor = CountVectorizer(analyzer=set) print("fitting...") preprocessor.fit(concat_data[feature_column]) print("finished fitting...") feature_column_names = preprocessor.get_feature_names() print(feature_column_names) le = LabelEncoder() le.fit(concat_data[label_column]) print("le classes: ", le.classes_) dump(preprocessor, os.path.join(model_artifacts_path, "model.joblib")) dump(le, os.path.join(model_artifacts_path, "label.joblib")) print("saved model!") except Exception as e: write_failure_file(failure_file_path, str(e)) print(e, file=sys.stderr) sys.exit(1)
import torch
import torch.optim as optim
from torch.utils.data import DataLoader


def main(args):
    """
    SM_CHANNEL values do not contain a trailing slash:
        SM_CHANNEL_TRAIN=/opt/ml/input/data/train
        SM_CHANNEL_VALIDATION=/opt/ml/input/data/validation
    Training job name: script-mode-container-xgb-2020-08-10-13-29-15-756
    """
    train_channel, validation_channel, model_dir = args.train, args.validation, args.model_dir

    print("\nList of files in train channel: ")
    print_files_in_path(train_channel)

    print("\nList of files in validation channel: ")
    print_files_in_path(validation_channel)

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Device:", device)

    kwargs = {"num_workers": 8, "pin_memory": True} if use_cuda else {}

    # Build a synthetic dataset and split it 70/30 into train and validation.
    input_features = 5
    n_samples = 5000
    dataset = MyDataset(n_samples, input_features, 3)
    train_len = int(n_samples * 0.7)
    test_len = n_samples - train_len
    train_set, val_set = torch.utils.data.random_split(dataset, [train_len, test_len])
    train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = DataLoader(val_set, batch_size=args.batch_size, shuffle=True, **kwargs)

    model = Net(input_features).to(device)
    # optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        # scheduler.step()
        test(model, device, test_loader)

    if args.save_model:
        save_model(model, model_dir)
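# --- Hypothetical sketches of MyDataset and Net (not defined in this file) ---
# main() above assumes a Dataset that yields (features, label) pairs and a
# small classifier matching the constructor calls MyDataset(n_samples,
# input_features, 3) and Net(input_features). A minimal version compatible
# with those call sites could be:
import torch
import torch.nn as nn
from torch.utils.data import Dataset


class MyDataset(Dataset):
    def __init__(self, n_samples, input_features, n_classes):
        # Random synthetic data: n_samples rows of input_features floats,
        # each with an integer class label in [0, n_classes).
        self.x = torch.randn(n_samples, input_features)
        self.y = torch.randint(0, n_classes, (n_samples,))

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]


class Net(nn.Module):
    def __init__(self, input_features, n_classes=3):
        super().__init__()
        # A small two-layer MLP producing one logit per class.
        self.fc = nn.Sequential(
            nn.Linear(input_features, 16),
            nn.ReLU(),
            nn.Linear(16, n_classes),
        )

    def forward(self, x):
        return self.fc(x)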
import os
import time


def train(hp1, hp2, hp3, train_channel, validation_channel):
    print("\nList of files in train channel: ")
    print_files_in_path(os.environ["SM_CHANNEL_TRAIN"])

    print("\nList of files in validation channel: ")
    print_files_in_path(os.environ["SM_CHANNEL_VALIDATION"])

    # Dummy net.
    net = None

    # Run training loop.
    epochs = 5
    for x in range(epochs):
        print("\nRunning epoch {0}...".format(x))
        time.sleep(30)
        print("Completed epoch {0}.".format(x))

    # At the end of the training loop, we have to save model artifacts.
    model_dir = os.environ["SM_MODEL_DIR"]
    save_model_artifacts(model_dir + "/", net)
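# --- Hypothetical helper sketches (not defined in this file) ---
# train() above relies on print_files_in_path and save_model_artifacts.
# Minimal versions, assuming artifacts are simply pickled into SM_MODEL_DIR
# (anything written there is packaged into model.tar.gz when the job ends):
import os
import pickle


def print_files_in_path(path):
    # Walk the channel directory and print every file it contains.
    for root, _, files in os.walk(path):
        for name in files:
            print(os.path.join(root, name))


def save_model_artifacts(model_artifacts_path, net):
    # Serialize the model object (even a dummy None) under the artifacts path.
    if os.path.exists(model_artifacts_path):
        with open(os.path.join(model_artifacts_path, "model.pkl"), "wb") as f:
            pickle.dump(net, f)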
def train(): try: print("\nRunning training...") if os.path.exists(hyperparameters_file_path): hyperparameters = load_json_object(hyperparameters_file_path) print('\nHyperparameters configuration:') print_json_object(hyperparameters) if os.path.exists(inputdataconfig_file_path): input_data_config = load_json_object(inputdataconfig_file_path) print('\nInput data configuration:') print_json_object(input_data_config) for key in input_data_config: print('\nList of files in {0} channel: '.format(key)) channel_path = data_files_path + key + '/' print_files_in_path(channel_path) if os.path.exists(resource_file_path): resource_config = load_json_object(resource_file_path) print('\nResource configuration:') print_json_object(resource_config) if (training_job_name_env in os.environ): print("\nTraining job name: ") print(os.environ[training_job_name_env]) if (training_job_arn_env in os.environ): print("\nTraining job ARN: ") print(os.environ[training_job_arn_env]) # This object is used to handle SIGTERM and SIGKILL signals. signal_handler = ExitSignalHandler() # Dummy net. net = None # Run training loop. epochs = 1 for x in range(epochs): print("\nRunning epoch {0}...".format(x)) time.sleep(10) if (signal_handler.exit_now): print( "Received SIGTERM/SIGINT. Saving training state and exiting." ) # Save state here. save_model_artifacts(model_artifacts_path, net) sys.exit(0) print("Completed epoch {0}.".format(x)) # At the end of the training loop, we have to save model artifacts. save_model_artifacts(model_artifacts_path, net) print("\nTraining completed!") except Exception as e: write_failure_file(failure_file_path, str(e)) print(e, file=sys.stderr) sys.exit(1)
from multiprocessing import cpu_count

from sklearn.metrics import classification_report, confusion_matrix, precision_score
from xgboost import XGBClassifier


def train(train_channel, validation_channel, model_dir, epochs):
    """
    SM_CHANNEL values do not contain a trailing slash:
        SM_CHANNEL_TRAIN=/opt/ml/input/data/train
        SM_CHANNEL_VALIDATION=/opt/ml/input/data/validation
    Training job name: script-mode-container-xgb-2020-08-10-13-29-15-756
    """
    print("\nList of files in train channel: ")
    print_files_in_path(train_channel)

    print("\nList of files in validation channel: ")
    print_files_in_path(validation_channel)

    X_train, X_test, y_train, y_test = get_data(train_channel, validation_channel)

    n_jobs = cpu_count() - 1
    parameters = {
        "min_child_weight": 5,
        "max_depth": 5,
        "learning_rate": 0.0001,
        "objective": "multi:softprob",
        "n_estimators": epochs,
    }
    model = XGBClassifier(
        base_score=0.5,
        booster="gbtree",
        colsample_bylevel=1,
        colsample_bynode=1,
        colsample_bytree=1,
        gamma=0,
        max_delta_step=0,
        missing=None,
        n_jobs=n_jobs,  # From version 1.1.1, -1 can't be used for all cores.
        nthread=None,
        random_state=0,
        reg_alpha=0,
        reg_lambda=1,
        # scale_pos_weight=1,
        subsample=1,
        verbosity=1,
        **parameters,
    )
    print(model)

    fit_params = {
        # "sample_weight": df_train_w["sample_weight"],
        "early_stopping_rounds": 10,
        "eval_metric": "mlogloss",
        "eval_set": [(X_train, y_train), (X_test, y_test)],
    }
    model.fit(X_train, y_train, **fit_params)
    # model.fit(X_train, y_train)

    # Evaluation
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    print(confusion_matrix(y_test, preds, labels=[0, 1, 2]))
    print(precision_score(y_test, preds, average="weighted"))

    save_model(model, model_dir)
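# --- Hypothetical sketches of get_data and save_model (assumptions) ---
# train() above expects get_data to return (X_train, X_test, y_train, y_test).
# The sketch below assumes headerless CSVs with the label in the first
# column; the real repository may load and persist data differently.
import os
import pickle

import pandas as pd


def get_data(train_channel, validation_channel):
    # Concatenate every CSV in a channel into one dataframe, then split
    # off the label (first column) from the features.
    def load_channel(channel):
        files = [os.path.join(channel, f) for f in os.listdir(channel)]
        df = pd.concat(pd.read_csv(f, header=None) for f in files)
        return df.iloc[:, 1:], df.iloc[:, 0]

    X_train, y_train = load_channel(train_channel)
    X_test, y_test = load_channel(validation_channel)
    return X_train, X_test, y_train, y_test


def save_model(model, model_dir):
    # Persist the fitted XGBClassifier under SM_MODEL_DIR so SageMaker
    # packages it into model.tar.gz.
    with open(os.path.join(model_dir, "xgboost-model.pkl"), "wb") as f:
        pickle.dump(model, f)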