def test_split_data(self):
    # NOTE: input_file is never passed to split_data below; the helper
    # presumably reads from a default location.
    input_file = 'data/test_questions.txt'
    x_train, y_train, x_test, y_test = split_data(0.8)
    self.assertNotEqual(x_train, {})
    self.assertNotEqual(y_train, {})
    self.assertNotEqual(x_test, {})
    self.assertNotEqual(y_test, {})
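# ---------------------------------------------------------------------------
# Hedged sketch, not the project's actual implementation: the test above
# implies split_data(train_frac) loads labelled questions from a default
# file and returns (x_train, y_train, x_test, y_test) as dicts. The default
# path, tab-separated layout, and index-keyed dicts are assumptions.
DEFAULT_INPUT_FILE = 'data/test_questions.txt'  # assumed default

def split_data(train_frac, input_file=DEFAULT_INPUT_FILE):
    x, y = {}, {}
    with open(input_file) as f:
        for idx, line in enumerate(f):
            # assumes "question<TAB>label" lines
            question, _, label = line.strip().rpartition('\t')
            x[idx], y[idx] = question, label
    cut = int(len(x) * train_frac)
    x_train = {k: v for k, v in x.items() if k < cut}
    y_train = {k: v for k, v in y.items() if k < cut}
    x_test = {k: v for k, v in x.items() if k >= cut}
    y_test = {k: v for k, v in y.items() if k >= cut}
    return x_train, y_train, x_test, y_test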
def train_loop(exe, train_progm, init, num_iters, train_data, dev_count,
               sum_cost, avg_cost, lr_scheduler, token_num, predict):
    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields
    start_time = time.time()
    exec_time = 0.0
    for batch_id, data in enumerate(train_data()):
        if batch_id >= num_iters:
            break
        feed_list = []
        total_num_token = 0
        for place_id, data_buffer in enumerate(
                split_data(data, num_part=dev_count)):
            data_input_dict, util_input_dict, num_token = prepare_batch_input(
                data_buffer, data_input_names, util_input_names,
                ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                ModelHyperParams.n_head, ModelHyperParams.d_model)
            total_num_token += num_token
            # Merge the data and util feeds; dict views cannot be
            # concatenated with + in Python 3, so build the merged
            # feed dict explicitly.
            feed_dict = dict(data_input_dict)
            feed_dict.update(util_input_dict)
            lr_rate = lr_scheduler.update_learning_rate()
            feed_dict[lr_scheduler.learning_rate.name] = lr_rate
            feed_list.append(feed_dict)
            if not init:
                for pos_enc_param_name in pos_enc_param_names:
                    pos_enc = position_encoding_init(
                        ModelHyperParams.max_length + 1,
                        ModelHyperParams.d_model)
                    feed_list[place_id][pos_enc_param_name] = pos_enc
        for feed_dict in feed_list:
            feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token
        exe_start_time = time.time()
        if dev_count > 1:  # parallel executor
            outs = exe.run(fetch_list=[sum_cost.name, token_num.name],
                           feed=feed_list)
        else:  # executor
            outs = exe.run(fetch_list=[sum_cost, token_num],
                           feed=feed_list[0])
        exec_time += time.time() - exe_start_time
        sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
        total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
        total_token_num = token_num_val.sum()
        total_avg_cost = total_sum_cost / total_token_num
        print("batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
              (batch_id, total_sum_cost, total_avg_cost,
               np.exp(min(total_avg_cost, 100))))
        init = True
    return time.time() - start_time, exec_time
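# ---------------------------------------------------------------------------
# Hedged sketch, an assumption rather than the actual helper: the loop above
# expects split_data(data, num_part=dev_count) to shard one batch into
# dev_count pieces, one per device. A minimal version (ignoring padding of
# uneven shards, which a real implementation would handle) might be:
def split_data(data, num_part):
    size = len(data)
    part = (size + num_part - 1) // num_part  # ceiling division
    return [data[i * part:(i + 1) * part] for i in range(num_part)]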
def test_split(self):
    '''Tests splitting the data into validation and training sets.'''
    raw_data = load_data()
    pp_data = pre_processing(raw_data)
    (x_tr, y_tr, x_vl, y_vl) = split_data(pp_data)
    self.assertIsNotNone(x_tr)
    self.assertIsNotNone(x_vl)
    self.assertIsNotNone(y_tr)
    self.assertIsNotNone(y_vl)
    # The validation split should hold roughly 20% of the examples.
    ratio_x = len(x_vl) / (len(x_vl) + len(x_tr))
    ratio_y = len(y_vl) / (len(y_vl) + len(y_tr))
    self.assertAlmostEqual(ratio_x, 0.2, places=1)
    self.assertAlmostEqual(ratio_y, 0.2, places=1)
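# ---------------------------------------------------------------------------
# Hedged sketch, assuming only what the test above checks: split_data
# returns (x_train, y_train, x_val, y_val) with about 20% held out for
# validation. The (features, labels) shape of pp_data is an assumption.
def split_data(pp_data, val_frac=0.2):
    x, y = pp_data  # assumes pre_processing returns (features, labels)
    cut = int(len(x) * (1 - val_frac))
    return x[:cut], y[:cut], x[cut:], y[cut:]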
def test_split_data():
    test_data = {
        'id': [0, 1, 2, 3, 4],
        'target': [0, 0, 1, 0, 1],
        'col1': [1, 2, 3, 4, 5],
        'col2': [2, 1, 1, 2, 1]
    }
    data_df = pd.DataFrame(data=test_data)
    data = split_data(data_df)

    # verify that columns were removed correctly
    assert "target" not in data[0].data.columns
    assert "id" not in data[0].data.columns
    assert "col1" in data[0].data.columns

    # verify that data was split as desired
    assert data[0].data.shape == (4, 2)
    assert data[1].data.shape == (1, 2)

    # the valid_data set's raw data is used for metric calculation, so
    # free_raw_data should be False
    assert not data[1].free_raw_data
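# ---------------------------------------------------------------------------
# Hedged sketch of the split_data under test. free_raw_data is a
# lightgbm.Dataset attribute, so the assertions above imply the helper
# builds LightGBM train/validation Datasets from the DataFrame, dropping
# the id column and using target as the label. The 80/20 split and
# random_state are illustrative choices that happen to reproduce the
# (4, 2) / (1, 2) shapes checked above.
import lightgbm
from sklearn.model_selection import train_test_split

def split_data(data_df):
    features = data_df.drop(['id', 'target'], axis=1)
    labels = data_df['target']
    f_train, f_valid, l_train, l_valid = train_test_split(
        features, labels, test_size=0.2, random_state=0)
    train_data = lightgbm.Dataset(f_train, label=l_train)
    # keep raw data around so metrics can be computed on the valid set
    valid_data = lightgbm.Dataset(f_valid, label=l_valid,
                                  free_raw_data=False)
    return [train_data, valid_data]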
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="insure_model_model.pkl",
    )
    parser.add_argument("--step_output",
                        type=str,
                        help="output for passing data to next step")
    parser.add_argument("--dataset_version",
                        type=str,
                        help="dataset version")
    parser.add_argument("--data_file_path",
                        type=str,
                        help="data file path; if specified, a new version "
                             "of the dataset will be registered")
    parser.add_argument("--caller_run_id",
                        type=str,
                        help="caller run id, for example ADF pipeline run id")
    parser.add_argument("--dataset_name",
                        type=str,
                        help="Dataset name. The dataset must be passed by "
                             "name to always get the desired dataset version "
                             "rather than the one used at pipeline creation")
    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if dataset_name:
        if data_file_path == 'none':
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)
        else:
            dataset = register_dataset(run.experiment.workspace,
                                       dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = "No dataset provided"
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data[1])
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=model, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
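# ---------------------------------------------------------------------------
# Hedged sketch of the register_dataset helper called above, assuming the
# azureml-core Tabular dataset API and a delimited (CSV-style) data file;
# the exact file format is an assumption.
from azureml.core import Dataset, Datastore

def register_dataset(aml_workspace, dataset_name, datastore_name, file_path):
    datastore = Datastore.get(aml_workspace, datastore_name)
    dataset = Dataset.Tabular.from_delimited_files(
        path=(datastore, file_path))
    dataset = dataset.register(workspace=aml_workspace,
                               name=dataset_name,
                               create_new_version=True)
    return dataset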
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument("--model_name",
                        type=str,
                        help="Name of the Model")
    parser.add_argument("--step_output",
                        type=str,
                        help="output for passing data to next step")
    parser.add_argument("--data_file_path",
                        type=str,
                        help="data file path; if specified, a new version "
                             "of the dataset will be registered")
    parser.add_argument("--dataset_name",
                        type=str,
                        help="Dataset name. The dataset must be passed by "
                             "name to always get the desired dataset version "
                             "rather than the one used at pipeline creation")
    parser.add_argument("--datastore_name",
                        type=str,
                        help="Datastore name.")
    parser.add_argument(
        "--ml_params",
        type=str,
        help="Parameters for the ML pipeline in JSON format, with defaults "
             "defined in parameters.json",
    )
    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)
    print("Argument [datastore_name]: %s" % args.datastore_name)
    print("Argument [ml_params]: %s" % args.ml_params)

    model_name = args.model_name
    step_output_path = args.step_output
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name
    datastore_name = args.datastore_name

    run = Run.get_context()

    training_args, preprocessing_args = parse_ml_params(run, args.ml_params)

    # Get the dataset
    dataset = get_or_register_dataset(dataset_name=dataset_name,
                                      datastore_name=datastore_name,
                                      data_file_path=data_file_path,
                                      aml_workspace=run.experiment.workspace)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Train the model.
    # Mount the dynamic version of the dataset, which can't be determined
    # at pipeline publish time.
    mount_context = dataset.mount()
    mount_context.start()
    print(f"mount_point is: {mount_context.mount_point}")

    data = split_data(mount_context.mount_point, preprocessing_args)
    model, history = train_model(data, training_args, preprocessing_args)

    mount_context.stop()

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(history)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    model.save(model_output_path)
    with open(os.path.join(step_output_path, "run_id.txt"), "w") as text_file:
        print(f"{run.id}", file=text_file)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    model.save(output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
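# ---------------------------------------------------------------------------
# Hedged sketch of parse_ml_params, which is not shown above. A plausible
# version loads the defaults from parameters.json, overlays any values from
# the --ml_params JSON string, logs them to the run, and returns the
# "training" and "preprocessing" sections; every detail here is an
# assumption for illustration.
import json

def parse_ml_params(run, ml_params_json):
    with open("parameters.json") as f:
        pars = json.load(f)
    if ml_params_json and ml_params_json != 'default':
        overrides = json.loads(ml_params_json)
        for section, values in overrides.items():
            pars.setdefault(section, {}).update(values)
    training_args = pars.get("training", {})
    preprocessing_args = pars.get("preprocessing", {})
    for (k, v) in {**training_args, **preprocessing_args}.items():
        run.log(k, v)
    return training_args, preprocessing_args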
if __name__ == "__main__":
    print('\nLOADING DATA...')
    data = load_data()
    start = time.time()
    min_max(data)
    for exam_type, name in [(0, 'IGG'), (1, 'IGM'), (2, 'PCR')]:
        print('\n------------------------\n')
        print('[NETWORK ' + name + '] SPLITTING DATA FOR TRAINING...')
        processed_data = process_data(data, exam_type)
        print('[NETWORK ' + name + '] PREPARING DATA FOR TRAINING...')
        splitted_data = split_data(processed_data)
        print('[NETWORK ' + name + '] BUILDING MODEL...')
        x_train = splitted_data[0]
        # index 2 is the validation inputs, given the
        # (x_train, y_train, x_val, y_val) ordering tested above
        y_train = splitted_data[2]
        print('[NETWORK ' + name + '] ' + str(len(x_train)) +
              ' ELEMENTS IN THE TRAINING SET...')
        print('[NETWORK ' + name + '] ' + str(len(y_train)) +
              ' ELEMENTS IN THE VALIDATION SET...')
        print('[NETWORK ' + name + '] INSTANTIATING NETWORKS...')
        network = NN2(x_train.shape[1])
        optimizer = torch.optim.Adam(network.parameters(), lr=LEARNING_RATE)
        criterion = torch.nn.BCELoss()
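# ---------------------------------------------------------------------------
# Hedged sketch of how the loop might continue after building the network,
# optimizer, and loss. EPOCHS and the helper name run_training are
# illustrative; the labels would come from splitted_data[1] under the
# (x_train, y_train, x_val, y_val) ordering noted above.
EPOCHS = 100  # assumed constant

def run_training(network, optimizer, criterion, x_train, y_train):
    inputs = torch.as_tensor(x_train, dtype=torch.float32)
    targets = torch.as_tensor(y_train, dtype=torch.float32)
    for _ in range(EPOCHS):
        optimizer.zero_grad()
        outputs = network(inputs).squeeze(-1)
        loss = criterion(outputs, targets)  # BCELoss expects probabilities
        loss.backward()
        optimizer.step()
    return network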
def main():
    TRAINING_DATA_DIR = os.environ.get("TRAINING_DATA")
    TEST_DATA = os.environ.get("TEST_DATA")
    train_data = pd.read_csv(TRAINING_DATA_DIR)
    test = pd.read_csv(TEST_DATA)
    add_columns = train.addingColumns(train_data, test)
    data, country_dict, all_data = train.addingWolrd(add_columns)

    # Select train (real) data from March 1st to March 24th
    dates_list = [
        '2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05',
        '2020-03-06', '2020-03-07', '2020-03-08', '2020-03-09', '2020-03-10',
        '2020-03-11', '2020-03-12', '2020-03-13', '2020-03-14', '2020-03-15',
        '2020-03-16', '2020-03-17', '2020-03-18', '2020-03-19', '2020-03-20',
        '2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24'
    ]

    # Filter Spain, run the Linear Regression workflow
    country_name = "Spain"  # alternatively: os.environ.get("COUNTRY")
    day_start = 39
    data_country = data[data['Country/Region'] == country_dict[country_name]]
    data_country = data_country.loc[data_country['Day_num'] >= day_start]
    X_train, Y_train_1, Y_train_2, X_test = train.split_data(data_country)
    model, pred = train.lin_reg(X_train, Y_train_1, X_test)

    # Create a df with both real cases and predictions
    # (predictions starting on March 12th)
    X_train_check = X_train.copy()
    X_train_check['Target'] = Y_train_1
    X_test_check = X_test.copy()
    X_test_check['Target'] = pred
    X_final_check = pd.concat([X_train_check, X_test_check])

    # Select predictions from March 1st to March 24th
    predicted_data = X_final_check.loc[X_final_check['Day_num'].isin(
        list(range(day_start, day_start + len(dates_list))))].Target
    real_data = train_data.loc[
        (train_data['Country/Region'] == country_name)
        & (train_data['Date'].isin(dates_list))]['ConfirmedCases']
    dates_list_num = list(range(0, len(dates_list)))

    # Plot results: predictions vs actual cases, in level and log scale
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    ax1.plot(dates_list_num, np.exp(predicted_data))
    ax1.plot(dates_list_num, real_data)
    ax1.axvline(10, linewidth=2, ls=':', color='grey', alpha=0.5)
    ax1.legend(['Predicted cases', 'Actual cases', 'Train-test split'],
               loc='upper left')
    ax1.set_xlabel("Day count (from March 1st to March 24th)")
    ax1.set_ylabel("Confirmed Cases")

    ax2.plot(dates_list_num, predicted_data)
    ax2.plot(dates_list_num, np.log(real_data))
    ax2.axvline(10, linewidth=2, ls=':', color='grey', alpha=0.5)
    ax2.legend(['Predicted cases', 'Actual cases', 'Train-test split'],
               loc='upper left')
    ax2.set_xlabel("Day count (from March 1st to March 24th)")
    ax2.set_ylabel("Log Confirmed Cases")

    plt.suptitle("ConfirmedCases predictions based on Linear Regression for "
                 + country_name)
    plt.show()
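# ---------------------------------------------------------------------------
# Hedged sketch of the train.split_data and train.lin_reg helpers, which
# are not shown. Consistent with the usage above (log-scale targets that
# the plot re-exponentiates, sklearn-style fit/predict), they might look
# roughly like this; the column names and the notna-based split rule are
# assumptions for illustration.
import numpy as np
from sklearn.linear_model import LinearRegression

def split_data(data_country):
    # rows with known targets form the training set; future days, the test set
    train_mask = data_country['ConfirmedCases'].notna()
    features = [c for c in data_country.columns
                if c not in ('ConfirmedCases', 'Fatalities')]
    X_train = data_country.loc[train_mask, features]
    Y_train_1 = np.log(data_country.loc[train_mask, 'ConfirmedCases'])
    Y_train_2 = np.log(data_country.loc[train_mask, 'Fatalities'])
    X_test = data_country.loc[~train_mask, features]
    return X_train, Y_train_1, Y_train_2, X_test

def lin_reg(X_train, Y_train, X_test):
    model = LinearRegression()
    model.fit(X_train, Y_train)
    pred = model.predict(X_test)
    return model, pred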
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="insurance_model.pkl",
    )
    parser.add_argument(
        "--data_file_path",
        type=str,
        help="data file path; if specified, a new version of the dataset "
             "will be registered",
        default="insurance",
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help="Dataset name",
        default="insurance_dataset",
    )
    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        # run.parent.log(k, v)

    # Get the dataset
    if dataset_name:
        if data_file_path == 'none':
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name)
        else:
            dataset = register_dataset(run.experiment.workspace,
                                       dataset_name,
                                       "workspaceblobstore",
                                       data_file_path)
    else:
        e = "No dataset provided"
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    # run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)
        # run.parent.log(k, v)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    # Upload the model file explicitly into artifacts
    print("Uploading the model into run artifacts...")
    run.upload_file(name="./outputs/models/" + model_name,
                    path_or_stream=output_path)
    print("Uploaded the model {} to experiment {}".format(
        model_name, run.experiment.name))
    dirpath = os.getcwd()
    print(dirpath)
    print("Following files are uploaded ")
    print(run.get_file_names())

    run.complete()
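# ---------------------------------------------------------------------------
# Hedged sketch of the train_model and get_model_metrics helpers used
# above, consistent with the LightGBM-style split_data sketched earlier.
# Note the call sites differ slightly (get_model_metrics receives data[1]
# in one script and the full data list in this one), so this sketch accepts
# either; the AUC metric choice is an assumption.
import lightgbm
from sklearn.metrics import roc_auc_score

def train_model(data, train_args):
    # data is [train_dataset, valid_dataset]; train_args feeds straight
    # into LightGBM's parameter dict
    model = lightgbm.train(train_args, data[0], valid_sets=[data[1]])
    return model

def get_model_metrics(model, data):
    # accept either the full [train, valid] pair or just the valid Dataset
    valid_data = data[1] if isinstance(data, (list, tuple)) else data
    # assumes the valid Dataset was built with free_raw_data=False, so its
    # raw frame is still available for prediction
    predictions = model.predict(valid_data.data)
    return {"auc": roc_auc_score(valid_data.label, predictions)}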