class CreateClassifier(object):
    def __init__(self):
        self.pre = Preprocess()
        self.nlp = NLPHelper()
        self.fex = FeatureExtractor()
        self.ut = Utility()
        self.mt = ModelTrainer()

    def createClassifier(self):
        # get golden data
        # data = self.nlp.getGoldenDataset()
        # extract entities and save them into a pickle
        # self.nlp.extractNews(data)
        # CHANGE MODULE WHEN SWITCHING BETWEEN ADDITIONAL IDN AND DEFAULT
        # self.nlp.core_nlp.close()

        # # find features in one text and save them to Excel
        # # scenario 1
        # # path = "scenario1_halfidn_pickle/"
        # # scenario 2
        # # path = "scenario2_fullidn_pickle/"
        # # scenario 3
        # path = "scenario3_stanford_pickle/"
        # path = "test/"
        # filelist = os.listdir(path)
        # data = pd.DataFrame()
        # for idx, file in enumerate(filelist):
        #     # open the pickle file that contains the NER, coreference, and POS data of a news text
        #     pkl_dict = self.ut.loadPickle(os.path.join(path, file))
        #     # extract features from the pickle file
        #     temp = self.fex.extractFeaturesFromPickle(pkl_dict)
        #     data = data.append(temp)

        # # scenario 1
        # self.ut.convertToExcel("scenario1_idnnerhalf_extracted_feature.xlsx", data, 'Sheet1')
        # # scenario 2
        # self.ut.convertToExcel("scenario2_idnnerfull_extracted_feature.xlsx", data, 'Sheet1')
        # # scenario 3
        # self.ut.convertToExcel("scenario3_stanford_extracted_feature.xlsx", data, 'Sheet1')
        # # scenario testing
        # self.ut.convertToExcel("testing_rf.xlsx", data, 'Sheet1')

        # for training: read the Excel file that contains the features
        # (the WHO and WHERE columns must be added first, and it must be decided
        # which entities are WHO and which are WHERE)
        # scenario 1
        # df = pd.read_excel('scenario1_idnnerhalf_extracted_feature.xlsx', sheet_name='Sheet1')
        # scenario 2
        df = pd.read_excel('scenario2_idnnerfull_extracted_feature.xlsx', sheet_name='Sheet1')
        # # scenario 3
        # df = pd.read_excel('scenario3_stanford_extracted_feature.xlsx', sheet_name='Sheet1')

        # train models for detecting "who" and "where"; passing 'where' or 'who'
        # means that column will be dropped (deleted) before training
        who = self.mt.train(df, 'where')
        where = self.mt.train(df, 'who')

        self.nlp.core_nlp.close()
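# A hypothetical entry point for running the pipeline above (assuming this module
# already imports Preprocess, NLPHelper, FeatureExtractor, Utility, and ModelTrainer);
# shown only as a usage sketch, not part of the original module.
if __name__ == '__main__':
    classifier_builder = CreateClassifier()
    classifier_builder.createClassifier()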
def train_and_eval(embedding, layers, batch_size, layers_type):
    # Device
    device = get_device()

    # Training parameters
    epochs = 5

    # Train and dev data
    train_file = './data/snli_1.0_train.jsonl'
    train_data = Data(train_file, embedding)
    dev_file = './data/snli_1.0_dev.jsonl'
    dev_data = Data(dev_file, embedding)
    test_file = './data/snli_1.0_test.jsonl'
    test_data = Data(test_file, embedding)

    # Create the model
    model = ResidualLSTMEncoder(embedding_vectors=embedding.vectors,
                                padding_index=train_data.padding_index,
                                layers_def=layers,
                                output_size=len(train_data.c2i),
                                max_sentence_length=Data.MAX_SENTENCE_SIZE,
                                hidden_mlp=800,
                                device=device,
                                layers_type=layers_type)
    num_of_params = sum(p.numel() for p in model.parameters())
    print("Number of model parameters: %d" % num_of_params)
    model = model.to(device)

    # Create optimizer
    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    # optimizer = optim.Adagrad(model.parameters())

    # Create a model trainer object
    model_trainer = ModelTrainer(net=model, device=device, optimizer=optimizer)

    # Train the model
    model_trainer.train(train_data, dev_data,
                        train_log_file='train_1.txt',
                        dev_log_file='dev_1.txt',
                        epochs=epochs, batch_size=batch_size)

    # Save the model
    model_trainer.save_model('./models/model_1')

    # Test the model
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                              shuffle=False, num_workers=0)
    test_performencer = Performencer(name='Test',
                                     output_size=model.output_size)
    model_trainer.eval(test_loader, test_performencer)
    test_performencer.pinpoint()
    test_performencer.log_to_file('test_1.txt')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, help='Data file path', required=True)
    parser.add_argument('--output', type=str, help='Output file path', required=True)
    parser.add_argument('--output_model', type=str, help='Model path', default=None)
    parser.add_argument('--level', type=int, default=0)
    parser.add_argument('--fold', type=int, default=2)
    parser.add_argument('--iter', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--epoch', type=int, default=30)
    parser.add_argument('--random_state', type=int, default=None)
    args = parser.parse_args()

    dataset = BrainDataset(args.data, expand_dim=True, level=args.level)
    model = CNN1D(len(np.unique(dataset.label))).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    epochs = args.epoch
    batch_size = args.batch_size

    trainer = ModelTrainer(model, dataset, DEVICE)
    result = trainer.train(optimizer, criterion, batch_size=batch_size,
                           epochs=epochs, kfold=args.fold, iteration=args.iter,
                           random_state=args.random_state)
    result = np.array(result)
    np.savetxt(args.output, result, delimiter=",")

    if args.output_model is not None:
        torch.save(model.state_dict(), args.output_model)
def train_small_test_version(self, hyperparams_dict):
    """Perform training on the small test data."""
    trainer = ModelTrainer(self.dataloaders, hyperparams_dict, self.wv_wrapper, self.path)
    model, losses, accuracies = trainer.train(epochs=3)
    return model, losses, accuracies
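# A usage sketch for the method above; "experiment" stands in for an instance of the
# enclosing experiment class, and the hyperparameter names/values are illustrative
# assumptions rather than keys this ModelTrainer necessarily expects.
# hyperparams = {'lr': 1e-3, 'batch_size': 32, 'hidden_dim': 128}
# model, losses, accuracies = experiment.train_small_test_version(hyperparams)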
elif opt.model_name == 2:  # MLP
    train_config['using_spectrogram'] = True
    train_config['criterion'] = 'MSE'
elif opt.model_name == 3:  # simple generator
    train_config['using_simple_g'] = True
    train_config['criterion'] = 'MSE'
elif opt.model_name == 4:  # 1D auto encoder
    train_config['criterion'] = 'MSE'
elif opt.model_name == 5:  # 2D auto encoder
    train_config['criterion'] = 'MSE'
    train_config['using_spectrogram'] = True
elif opt.model_name == 6:  # simple auto encoder
    train_config['criterion'] = 'MSE'
elif opt.model_name == 7:  # adversarial MLP
    train_config['using_spectrogram'] = True

trainer = ModelTrainer(**train_config)
trainer.train()
def main(): logging.warning("dummy warning!!!") logging.error("dummy error!!!") logging.info("dummy info!!!") logging.debug("dummy debug!!!") logging.warning(f"Inside {__file__}") parser = argparse.ArgumentParser() parser.add_argument("--subscription_id", type=str, dest="subscription_id", help="The Azure subscription ID") parser.add_argument("--resource_group", type=str, dest="resource_group", help="The resource group name") parser.add_argument("--workspace_name", type=str, dest="workspace_name", help="The workspace name") parser.add_argument("--experiments_config_filepath", type=str, dest="experiments_config_filepath", help="A path to the JSON config file") # noqa: E501 parser.add_argument("--model_name", type=str, dest="model_name", help="Name of the Model") parser.add_argument("--should_register_model", type=str2bool, dest="should_register_model", default=False, help="Register trained model") # noqa: E501 args = parser.parse_args() logging.warning(f"Argument 1: {args.subscription_id}") logging.warning(f"Argument 2: {args.resource_group}") logging.warning(f"Argument 3: {args.workspace_name}") logging.warning(f"Argument 4: {args.experiments_config_filepath}") logging.warning(f"Argument 5: {args.model_name}") logging.warning(f"Argument 6: {args.should_register_model}") # Get current service context run = Run.get_context() workspace = run.experiment.workspace # Load training configuration experiment_configuration = ExperimentConfigurationWrapper() experiment_configuration.load(args.experiments_config_filepath) training_config = experiment_configuration.json["feature_extractor"][ "training"] # initialize empty collections for data # train_set = [] # test_set = [] # dev_set = [] download_root_dir = os.path.join('/mnt', 'tmp', 'datasets') data_splitter = HDF5TrainTestSplitter() for data_config in training_config["data"]: cropped_cells_dataset_name = data_config['input'][ 'cropped_cells_dataset_name'] cropped_cells_dataset_version = data_config['input'][ 'cropped_cells_dataset_version'] cropped_cells_dataset = Dataset.get_by_name( workspace=workspace, name=cropped_cells_dataset_name, version=cropped_cells_dataset_version) msg = ( f"Dataset '{cropped_cells_dataset_name}', id: {cropped_cells_dataset.id}" f", version: {cropped_cells_dataset.version} will be used to prepare data for a feature extractor training." 
) logging.warning(msg) # Create a folder where datasets will be downloaded to dataset_target_path = os.path.join(download_root_dir, cropped_cells_dataset_name) os.makedirs(dataset_target_path, exist_ok=True) # Download 'cropped cells' dataset (consisting of HDF5 and CSV files) dataset_target_path = download_registered_file_dataset( workspace, cropped_cells_dataset, download_root_dir) list_all_files_in_location(dataset_target_path) # Split data (indices) into subsets df_metadata = pd.read_csv( os.path.join(dataset_target_path, 'cropped_nuclei.csv')) logging.warning(f"Metadata dataframe (shape): {df_metadata.shape}") logging.warning("Splitting data into subsets...") data_splitter.add_dataset( name=data_config['input']['cropped_cells_dataset_name'], fname=os.path.join(dataset_target_path, 'cropped_nuclei_images.h5'), metadata=df_metadata) data_splitter.train_dev_test_split() # -------- # Training # -------- # Init dataloaders #train_dataset = CellDataset(cell_list=train_set, target_cell_shape=INPUT_IMAGE_SIZE) train_dataset = CellDataset(splitter=data_splitter, dset_type='train') train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=DATA_LOADER_WORKERS, ) #dev_dataset = CellDataset(cell_list=dev_set, target_cell_shape=INPUT_IMAGE_SIZE) dev_dataset = CellDataset(splitter=data_splitter, dset_type='dev') dev_data_loader = torch.utils.data.DataLoader( dev_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=DATA_LOADER_WORKERS, ) #test_dataset = CellDataset(cell_list=test_set, target_cell_shape=INPUT_IMAGE_SIZE) test_dataset = CellDataset(splitter=data_splitter, dset_type='test') test_data_loader = torch.utils.data.DataLoader( test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=DATA_LOADER_WORKERS, ) # Define and Train model device = torch.device(DEVICE) model = AUTOENCODER( latent_dim_size=LATENT_DIM_SIZE, input_image_size=INPUT_IMAGE_SIZE, device=device, ) optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) print(f"Using {torch.cuda.device_count()} GPUs for training") # model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3]) trainer = ModelTrainer(model, device) tr_losses, dev_losses = trainer.train( epochs=EPOCHS, optimizer=optimizer, train_data_loader=train_data_loader, dev_data_loader=dev_data_loader, ) test_loss = trainer.test_model(test_data_loader) run.log("dev_loss", np.max(dev_losses)) run.log("train_loss", np.max(tr_losses)) run.log("test_loss", test_loss) # Plot training metrics and model sample reconstructions trainer.get_training_plot(tr_losses=tr_losses, dev_losses=dev_losses) run.log_image("model training metrics", plot=plt) dataiter = iter(test_data_loader) images = dataiter.next() trainer.get_pred_samples(images, figsize=(40, 40)) run.log_image("sample reconstructions", plot=plt) # Training completed! 
Let's save the model and upload it to AML os.makedirs("./models", exist_ok=True) model_file_name = "model.ext" model_output_loc = os.path.join(".", "models", model_file_name) torch.save(model, model_output_loc) run.upload_files(names=[model_output_loc], paths=[model_output_loc]) # Register model (ideally, this should be a separate step) if args.should_register_model: logging.warning("List of the associated stored files:") logging.warning(run.get_file_names()) logging.warning("Registering a new model...") # TODO: prepare a list of metrics that were logged using run.log() metric_names = [] if os.path.exists(model_output_loc): register_model( run=run, model_name=args.model_name, model_description="Feature extraction model", model_path=model_output_loc, training_context="PythonScript", metric_names=metric_names, ) else: logging.warning( f"Cannot register model as path {model_output_loc} does not exist." ) else: logging.warning("A trained model will not be registered.") logging.warning("Done!") logging.info("Done Info Style!")
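# The argparse call above relies on a str2bool converter for --should_register_model;
# a common sketch of such a helper (an assumption here, not necessarily the project's
# own definition):
def str2bool(value):
    """Convert a command-line string such as 'true'/'false' into a bool."""
    if isinstance(value, bool):
        return value
    if value.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if value.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got '{value}'.")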
def main(args):
    total_step = 100 // args.EF

    # set random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)

    # prepare checkpoints and log folders
    if not os.path.exists(args.checkpoints_dir):
        os.makedirs(args.checkpoints_dir)
    if not os.path.exists(args.logs_dir):
        os.makedirs(args.logs_dir)

    # initialize dataset
    if args.dataset == 'visda':
        args.data_dir = os.path.join(args.data_dir, 'visda')
        data = Visda_Dataset(root=args.data_dir, partition='train', label_flag=None)
    elif args.dataset == 'office':
        args.data_dir = os.path.join(args.data_dir, 'Office')
        data = Office_Dataset(root=args.data_dir, partition='train', label_flag=None,
                              source=args.source_name, target=args.target_name)
    elif args.dataset == 'home':
        args.data_dir = os.path.join(args.data_dir, 'OfficeHome')
        data = Home_Dataset(root=args.data_dir, partition='train', label_flag=None,
                            source=args.source_name, target=args.target_name)
    elif args.dataset == 'visda18':
        args.data_dir = os.path.join(args.data_dir, 'visda18')
        data = Visda18_Dataset(root=args.data_dir, partition='train', label_flag=None)
    else:
        print('Unknown dataset!')

    args.class_name = data.class_name
    args.num_class = data.num_class
    args.alpha = data.alpha

    # setting experiment name
    label_flag = None
    selected_idx = None
    args.experiment = set_exp_name(args)
    logger = Logger(args)

    if not args.visualization:
        for step in range(total_step):
            print("This is {}-th step with EF={}%".format(step, args.EF))
            trainer = ModelTrainer(args=args, data=data, step=step, label_flag=label_flag,
                                   v=selected_idx, logger=logger)

            # train the model
            args.log_epoch = 4 + step // 2
            trainer.train(step, epochs=4 + step * 2, step_size=args.log_epoch)

            # pseudo_label
            pred_y, pred_score, pred_acc = trainer.estimate_label()

            # select data from target to source
            selected_idx = trainer.select_top_data(pred_score)

            # add new data
            label_flag, data = trainer.generate_new_train_data(selected_idx, pred_y, pred_acc)
    else:
        # load trained weights
        trainer = ModelTrainer(args=args, data=data)
        trainer.load_model_weight(args.checkpoint_path)
        vgg_feat, node_feat, target_labels, split = trainer.extract_feature()
        visualize_TSNE(node_feat, target_labels, args.num_class, args, split)
        plt.savefig('./node_tsne.png', dpi=300)
def main(args):
    # Modified here
    total_step = 100 // args.EF

    # set random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)

    # prepare checkpoints and log folders
    if not os.path.exists(args.checkpoints_dir):
        os.makedirs(args.checkpoints_dir)
    if not os.path.exists(args.logs_dir):
        os.makedirs(args.logs_dir)

    # initialize dataset
    if args.dataset == 'nusimg':
        args.data_dir = os.path.join(args.data_dir, 'visda')
        data = NUSIMG_Dataset(root=args.data_dir, partition='train', label_flag=None,
                              source=args.source_path, target=args.target_path)
    elif args.dataset == 'office':
        args.data_dir = os.path.join(args.data_dir, 'Office')
        data = Office_Dataset(root=args.data_dir, partition='train', label_flag=None,
                              source=args.source_path, target=args.target_path)
    elif args.dataset == 'mrc':
        data = MRC_Dataset(root=args.data_dir, partition='train', label_flag=None,
                           source=args.source_path, target=args.target_path)
    else:
        print('Unknown dataset!')

    args.class_name = data.class_name
    args.num_class = data.num_class
    args.alpha = data.alpha

    # setting experiment name
    label_flag = None
    selected_idx = None
    args.experiment = set_exp_name(args)
    logger = Logger(args)

    trainer = ModelTrainer(args=args, data=data, label_flag=label_flag,
                           v=selected_idx, logger=logger)

    for step in range(total_step):
        print("This is {}-th step with EF={}%".format(step, args.EF))

        # train the model
        args.log_epoch = 5
        trainer.train(epochs=24, step=step)  # 24

        # pseudo-label
        pred_y, pred_score, pred_acc = trainer.estimate_label()

        # select data from target to source
        selected_idx = trainer.select_top_data(pred_score)

        # add new data
        trainer.generate_new_train_data(selected_idx, pred_y, pred_acc)
def train():
    model_trainer = ModelTrainer(parsed_args)
    model_trainer.train()
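# The wrapper above assumes a module-level `parsed_args`; a minimal sketch of how it
# might be produced (the flag names and defaults here are assumptions, not the
# project's actual CLI):
import argparse

_parser = argparse.ArgumentParser()
_parser.add_argument('--epochs', type=int, default=10)
_parser.add_argument('--batch_size', type=int, default=32)
parsed_args = _parser.parse_args()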
if torch.cuda.is_available():
    device = torch.device(args.device)
else:
    device = torch.device('cpu')

optimizer = torch.optim.Adam(network.parameters(), lr=args.learning_rate, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_schedule, gamma=lr_gamma)

# ## Debug
# torch.autograd.set_detect_anomaly(True)

trainer = ModelTrainer(model=network,
                       train_loader=trainloader,
                       test_loader=testloader,
                       optimizer=optimizer,
                       scheduler=scheduler,
                       criterion=loss_fn,
                       epochs=args.epochs,
                       name=args.name,
                       test_freq=args.val_freq,
                       device=args.device)

# train
best_model = trainer.train()

# # Test best model
# trainer.test_best_model(best_model, fname_suffix='_posttraining')
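# The block above assumes `network`, `trainloader`, `testloader`, `loss_fn`,
# `lr_schedule`, and `lr_gamma` are defined earlier in the script; a minimal
# self-contained sketch of such a setup (the model, data shapes, and schedule
# values are illustrative assumptions):
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

network = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 2))
loss_fn = nn.CrossEntropyLoss()
lr_schedule, lr_gamma = 10, 0.1  # decay the learning rate by 10x every 10 epochs

train_set = TensorDataset(torch.randn(128, 16), torch.randint(0, 2, (128,)))
test_set = TensorDataset(torch.randn(32, 16), torch.randint(0, 2, (32,)))
trainloader = DataLoader(train_set, batch_size=16, shuffle=True)
testloader = DataLoader(test_set, batch_size=16, shuffle=False)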