def main():
    # Set the log file for debugging use
    utils.set_logger(os.path.join(os.getcwd(), 'train.log'))

    logging.info('Loading datasets...')
    data_loader = DataLoader(DATA_PATH)
    X_train, Y_train, X_val, Y_val = data_loader.get_train_data()
    X_test, Y_test = data_loader.get_test_data()

    logging.info('Building the model...')
    my_model = seq2class()  # TODO: pass the model hyperparameters here
    print("Here is our model: ")
    my_model.model.summary()  # summary() prints the model itself and returns None

    logging.info('Training...')
    history = my_model.model.fit(X_train, Y_train,
                                 epochs=EPOCHS,
                                 verbose=1,
                                 batch_size=BATCH_SIZE,
                                 validation_data=(X_val, Y_val))
    logging.info(f"train loss: {history.history['loss']}")
    logging.info(f"val loss: {history.history['val_loss']}")
    logging.info(f"train accuracy: {history.history['acc']}")
    logging.info(f"val accuracy: {history.history['val_acc']}")

    # Plotting the loss history
    # plot = utils.Plotting(history)
    # plot.plot_loss()
    # plot.plot_accuracy()

    print('Testing...')
    loss, accuracy = my_model.model.evaluate(X_test, Y_test)
    logging.info(f"test loss: {loss}")
    logging.info(f"test accuracy: {accuracy}")
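# NOTE: utils.set_logger is shared by all the scripts in this collection but is
# not defined in the excerpt. A minimal sketch of the usual pattern (assumed,
# not the project's actual implementation): write logging.info output to a file
# and mirror it to the console.
import logging

def set_logger(log_path):
    """Send logging output to log_path and to the terminal."""
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        # Log to a file so each run leaves a permanent record
        file_handler = logging.FileHandler(log_path)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        logger.addHandler(file_handler)
        # Mirror everything to the terminal
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(stream_handler)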
def evaluate_from_workspace(workspace_dir):
    """Evaluate the model on the test set."""
    global args, data_loader

    data_dir = workspace_dir
    model_dir = os.path.join(data_dir, "model")

    # Load the parameters
    args = parser.parse_args()
    json_path = os.path.join(model_dir, 'params.json')
    assert os.path.isfile(json_path), \
        "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)
    params.data_dir = data_dir if data_dir else args.data_dir
    params.model_dir = model_dir if model_dir else args.model_dir

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)
    if params.cuda:
        torch.cuda.manual_seed(230)

    # Get the logger
    utils.set_logger(os.path.join(params.model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Creating the dataset...")

    # load data
    data_loader = DataLoader(params.data_dir, params)
    data = data_loader.load_data_from_dir(['test'], params.data_dir)
    test_data = data['test']

    # specify the test set size
    params.test_size = test_data['size']
    test_data_iterator = data_loader.data_iterator(test_data, params)
    logging.info("- done.")

    # Define the model
    model = net.Net(params).cuda() if params.cuda else net.Net(params)
    loss_fn = net.loss_fn
    metrics = net.metrics

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(
        os.path.join(params.model_dir, args.restore_file + '.pth.tar'), model)

    # Evaluate
    num_steps = (params.test_size + 1) // params.batch_size
    test_metrics = evaluate(model, loss_fn, test_data_iterator, metrics,
                            params, num_steps)
    save_path = os.path.join(params.model_dir,
                             "metrics_test_{}.json".format(args.restore_file))
    utils.save_dict_to_json(test_metrics, save_path)
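# NOTE: utils.load_checkpoint is called above but not defined in the excerpt.
# A minimal sketch consistent with the call sites, assuming checkpoints were
# saved as dicts with 'state_dict' (and optionally 'optim_dict') keys:
import os
import torch

def load_checkpoint(checkpoint, model, optimizer=None):
    """Load model (and optionally optimizer) state from a .pth.tar checkpoint."""
    if not os.path.exists(checkpoint):
        raise FileNotFoundError("File doesn't exist {}".format(checkpoint))
    state = torch.load(checkpoint, map_location='cpu')
    model.load_state_dict(state['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(state['optim_dict'])
    return state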
def setUp(self):
    # Sort data files by the integer suffix in their filename,
    # e.g. "../data/batch_12.npy" -> 12
    self.files = sorted(
        Path("../data").iterdir(),
        key=lambda x: int(str(x).split("/")[-1].split(".")[0].split("_")[-1]))
    self.dataloader = DataLoader(files=self.files)
from torch import device as dev

MODEL_DIR = 'experiments/base_model/'
DATA_DIR = 'data/'

params = utils.Params(MODEL_DIR + 'params.json')
params.vocab_size = 25
params.number_of_classes = 10
params.cuda = torch.cuda.is_available()

# Load the trained weights onto the CPU
weights = MODEL_DIR + 'best.pth'
model = net.Net(params).cuda() if params.cuda else net.Net(params)
checkpoint = torch.load(weights, map_location=dev('cpu'))
model.load_state_dict(checkpoint['state_dict'])

data_loader = DataLoader(DATA_DIR, params)
data = data_loader.load_data(['train', 'val'], DATA_DIR)

train_data = data['train']
train_data_iterator = data_loader.data_iterator(train_data, params, shuffle=True)
train_batch, _ = next(train_data_iterator)

val_data = data['val']
val_data_iterator = data_loader.data_iterator(val_data, params, shuffle=False)
val_batch, _ = next(val_data_iterator)

# Explain the model with SHAP, using one training example as the background set
explainer = shap.KernelExplainer(model.forward, train_batch[:1])
vals = train_batch[:10]
shap_values = explainer.shap_values(vals)
shap.force_plot(explainer.expected_value[0], shap_values[0][0],
                train_batch[0])  # features argument assumed; the original snippet breaks off here
import test_context
import re

from model.data_loader import DataLoader

dl = DataLoader('data/xml/CoreOnly.xml')

# Flag actions whose text contains a dice expression (e.g. "2d6")
# but which have no parsed attack entry
for mon in dl.monsters:
    for atype in [mon.traits, mon.actions, mon.reactions, mon.legendaries]:
        for act in atype:
            t = ''  # guard against act.text being None
            if isinstance(act.text, list):
                t = ''.join(at for at in act.text if at is not None)
            elif act.text is not None:
                t = act.text
            if t:
                m = re.search(r'[0-9]+d[0-9]+', t)
                if m and act.attack is None:
                    print('{}: {}'.format(mon.name, act.name))
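# The dice-expression pattern is easy to sanity-check in isolation; a quick
# standalone demo with made-up strings (pure `re`, no project imports):
import re

dice = re.compile(r'[0-9]+d[0-9]+')

# Matches standard dice notation anywhere in the string
assert dice.search('Melee Weapon Attack: 2d6 + 3 slashing damage')
assert dice.search('regains 10d8 hit points')
assert dice.search('a d20 roll') is None  # needs a leading count, e.g. "1d20"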
# use GPU if available
params.cuda = torch.cuda.is_available()

# Set the random seed for reproducible experiments
# torch.manual_seed(230)
# if params.cuda:
#     torch.cuda.manual_seed(230)

# Set the logger
utils.set_logger(os.path.join(args.model_dir, 'train.log'))

# Create the input data pipeline
logging.info("Loading the datasets...")

# load data
data_loader = DataLoader(params)
data = data_loader.load_data(['train', 'dev'])
train_data = data['train']
val_data = data['dev']

# specify the train and val dataset sizes
params.train_size = train_data['size']
params.val_size = val_data['size']
logging.info("- done.")

# Define the model and optimizer
model = net.Net(params).cuda() if params.cuda else net.Net(params)
optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

# fetch loss function and metrics
# use GPU if available
params.cuda = torch.cuda.is_available()

# Set the random seed for reproducible experiments
# torch.manual_seed(230)
# if params.cuda:
#     torch.cuda.manual_seed(230)

# Set the logger
utils.set_logger(os.path.join(args.model_dir, 'train.log'))

# Create the input data pipeline
logging.info("Loading the datasets...")

# load data
data_loader = DataLoader(params)
train_data_path = os.path.join(args.data_dir, 'train', 'train_data.json')
data_loader.load_data(train_data_path, split='train',
                      size_limit=args.dataset_size_limit)
# data_loader.split_data(split_ratio=params.split_ratio)
val_data_path = os.path.join(args.data_dir, 'val', 'val_data.json')
data_loader.load_data(val_data_path, 'val',
                      size_limit=args.dataset_size_limit)

# specify the train and val dataset sizes (appended in data_loader)
params.train_size = data_loader.get_dataset_size('train')
params.val_size = data_loader.get_dataset_size('val')
logging.info("- done.")
params = utils.Params(json_path)
params.cuda = torch.cuda.is_available()

torch.manual_seed(230)
if params.cuda:
    torch.cuda.manual_seed(230)

# Set the logger
utils.set_logger(os.path.join(args.model_dir, 'train.log'))

# Create the input data pipeline
logging.info("Loading the datasets...")

# load data
data_loader = DataLoader('data/', params)
data = data_loader.load_data(['train', 'val'], 'data/')
train_data = data['train']
val_data = data['val']

# specify the train and val dataset sizes
params.train_size = train_data['size']
params.val_size = val_data['size']
logging.info('- done.')

# Define the model and optimizer
model = net.Net(params).cuda() if params.cuda else net.Net(params)
optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

# fetch loss function and metrics
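# NOTE: every snippet here relies on a small utils.Params helper that wraps a
# JSON config in attribute access. Its implementation is not part of the
# excerpt; a minimal sketch consistent with the usage above (Params(json_path),
# params.learning_rate, params.update(...), params.dict) might look like this:
import json

class Params:
    """Hyperparameters loaded from a JSON file, exposed as attributes."""

    def __init__(self, json_path):
        with open(json_path) as f:
            # Merge the file's keys into the instance namespace, so
            # params.learning_rate mirrors {"learning_rate": ...} in the file
            self.__dict__.update(json.load(f))

    def update(self, json_path):
        """Overlay another JSON file on top of the current parameters."""
        with open(json_path) as f:
            self.__dict__.update(json.load(f))

    @property
    def dict(self):
        """Dict-like view, used by snippets that inspect params.dict."""
        return self.__dict__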
params = utils.Params(json_path)

# In[3]:

# use GPU if available
params.cuda = torch.cuda.is_available()
params.dict

# In[4]:

# load data
data_loader = DataLoader(data_dir, params)
data = data_loader.load_data(['train', 'val', 'test'])
train_data = data['train']
val_data = data['val']
test_data = data['test']

# specify the train and val dataset sizes
params.train_size = train_data['size']
params.val_size = val_data['size']
params.test_size = test_data['size']
params.pad_tag_ind = data_loader.tag_map[params.pad_tag]

# In[5]:
# use GPU if available
params.cuda = torch.cuda.is_available()

# Set the random seed for reproducible experiments
torch.manual_seed(230)
if params.cuda:
    torch.cuda.manual_seed(230)

# Get the logger
utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Create the input data pipeline
logging.info("Creating the dataset...")

# load data
data_loader = DataLoader('data/', params)
data = data_loader.load_data(['test'], 'data/')
test_data = data['test']

# specify the test set size
params.test_size = test_data['size']
test_data_iterator = data_loader.data_iterator(test_data, params)
logging.info("- done.")

# Define the model
model = net.Net(params).cuda() if params.cuda else net.Net(params)
loss_fn = net.loss_fn
metrics = net.metrics
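# NOTE: data_iterator is consumed everywhere above but never shown. A minimal
# sketch of the usual pattern, under the assumptions that `data` is a dict with
# 'data', 'labels', and 'size' keys and that the padding index lives on the
# loader (here passed as pad_ind): pad to the longest sentence in each batch
# and yield LongTensor pairs.
import random

import numpy as np
import torch

def data_iterator(data, params, shuffle=False, pad_ind=0):
    """Yield (sentences, labels) LongTensor batches, padded per batch."""
    order = list(range(data['size']))
    if shuffle:
        random.seed(230)
        random.shuffle(order)

    for i in range(data['size'] // params.batch_size):
        idx = order[i * params.batch_size:(i + 1) * params.batch_size]
        batch_sentences = [data['data'][j] for j in idx]
        batch_tags = [data['labels'][j] for j in idx]

        # Pad every sentence to the longest one in this batch; labels use -1
        # so padding positions can be masked out of the loss later
        batch_max_len = max(len(s) for s in batch_sentences)
        batch_data = pad_ind * np.ones((len(batch_sentences), batch_max_len),
                                       dtype=np.int64)
        batch_labels = -1 * np.ones((len(batch_sentences), batch_max_len),
                                    dtype=np.int64)
        for j, (s, t) in enumerate(zip(batch_sentences, batch_tags)):
            batch_data[j][:len(s)] = s
            batch_labels[j][:len(t)] = t

        yield torch.LongTensor(batch_data), torch.LongTensor(batch_labels)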
params.number_of_classes = 10
params.cuda = torch.cuda.is_available()
# weights = MODEL_DIR + 'best.pth'

classes = [
    'Extracellular', 'Plastid', 'Cytoplasm', 'Mitochondrion', 'Nucleus',
    'Endoplasmic.reticulum', 'Golgi.apparatus', 'Cell.membrane',
    'Lysosome/Vacuole', 'Peroxisome'
]

# model = net.Net(params).cuda() if params.cuda else net.Net(params)
# checkpoint = torch.load(weights, map_location=dev('cpu'))
# model.load_state_dict(checkpoint['state_dict'])
# model.eval()

loader = DataLoader(DATA_DIR, params)
# data = loader.load_data(['train', 'val'], DATA_DIR)
# train_data = data['train']
# train_data_iterator = loader.data_iterator(train_data, params, shuffle=False)
# train_batch, label_batch = next(train_data_iterator)

# sentences_file = DATA_DIR + 'train/sentences.txt'
# labels_file = DATA_DIR + 'train/labels.txt'
# sentences = []
# with open(sentences_file) as f:
#     for sent in f.read().splitlines():
#         sentences.append(sent)
# labels = []
# with open(labels_file) as f:
#     for lab in f.read().splitlines():
#         labels.append(lab)

token_reference = TokenReferenceBase(reference_token_idx=loader.pad_ind)
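# NOTE: TokenReferenceBase comes from Captum (captum.attr); it builds an
# all-padding baseline sequence for attribution methods such as
# LayerIntegratedGradients. A hedged usage sketch — the sequence length and the
# model's embedding attribute name are placeholders, not from the original:
import torch
from captum.attr import TokenReferenceBase, LayerIntegratedGradients

seq_len = 50                      # hypothetical sequence length
device = torch.device('cpu')

# All-PAD baseline of shape (1, seq_len) to compare real inputs against
reference_indices = token_reference.generate_reference(
    seq_len, device=device).unsqueeze(0)

# Attribute predictions to the embedding layer; `model.embedding` is an
# assumption about the network's attribute name
# lig = LayerIntegratedGradients(model, model.embedding)
# attributions, delta = lig.attribute(input_indices, reference_indices,
#                                     target=0, return_convergence_delta=True)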
@staticmethod
def exact_match_score(outputs, data_batch):
    preds = outputs_to_preds(outputs)
    preds = preds_to_answers(preds, data_batch['tkd_c'])
    # Note: the bare name resolves to the module-level exact_match_score
    # (the SQuAD metric), not to this staticmethod
    scores = [
        metric_max_over_ground_truths(exact_match_score, pred,
                                      data_batch['gts'][i])
        for i, pred in enumerate(preds)
    ]
    return np.mean(scores)

@staticmethod
def f1_score(outputs, data_batch):
    preds = outputs_to_preds(outputs)
    preds = preds_to_answers(preds, data_batch['tkd_c'])
    # Same shadowing note as above: this is the module-level f1_score
    scores = [
        metric_max_over_ground_truths(f1_score, pred, data_batch['gts'][i])
        for i, pred in enumerate(preds)
    ]
    return np.mean(scores)


if __name__ == '__main__':
    from utils.model_utils import Params
    params = Params('../model_dir/configs.json')
    params.update('../data/dataset_configs.json')

    from model.data_loader import DataLoader
    data_loader = DataLoader(params)

    _model = Model(params)
    print(_model)
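# NOTE: metric_max_over_ground_truths follows the official SQuAD evaluation
# convention: score a prediction against every acceptable reference answer and
# keep the best. A minimal sketch, assuming the metric takes
# (prediction, ground_truth) strings:
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Best score of `prediction` against any acceptable reference answer."""
    return max(metric_fn(prediction, gt) for gt in ground_truths)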
job_name = "emb{}_lr{}_k{}_bs{}_f{}_dr{}_{}_{}".format(
    params.emb, params.learning_rate, params.kernels, params.batch_size,
    params.filters, params.dropout, params.model, params.task)

# Create a new folder in parent_dir with unique_name "job_name"
model_dir = os.path.join(params.save_path, 'fold' + str(params.fold), job_name)
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Set the logger
utils.set_logger(os.path.join(model_dir, 'train.log'))
logging.info("Loading the datasets...")

# load data
data_loader = DataLoader(params.local_data, params)
data = data_loader.load_data(['train', 'val', 'test'], params.local_data)
train_data = data['train']
val_data = data['val']
test_data = data['test']

# specify the train and val dataset sizes
params.train_size = train_data['size']
params.val_size = val_data['size']
logging.info("- done.")

# Define the model and optimizer; abort outright if no GPU is present
# (sys.exit raises SystemExit before torch.device is ever constructed)
device = torch.device("cuda:0" if params.cuda else sys.exit("gpu unavailable"))
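# For concreteness, with hypothetical parameter values the job name resolves
# to a flat, filesystem-safe string:
job_name = "emb{}_lr{}_k{}_bs{}_f{}_dr{}_{}_{}".format(
    'w2v', 0.001, 3, 32, 100, 0.5, 'cnn_text', 'binary')
print(job_name)  # -> embw2v_lr0.001_k3_bs32_f100_dr0.5_cnn_text_binary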
job_name = "emb{}_lr{}_k{}_bs{}_f{}_dr{}_{}_{}".format(
    params.emb, params.learning_rate, params.kernels, params.batch_size,
    params.filters, params.dropout, params.model, params.task)

# Create a new folder in parent_dir with unique_name "job_name"
model_dir = os.path.join(params.save_path, 'fold' + str(params.fold), job_name)
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Set the logger
utils.set_logger(os.path.join(model_dir, 'eval.log'))
logging.info("Loading the datasets...")

# load data
data_loader = DataLoader(params.data_dir, params)
data = data_loader.load_data(['test'], params.data_dir)
test_data = data['test']

# specify the test set size
params.test_size = test_data['size']
logging.info("- done.")

# Define the model and optimizer; abort outright if no GPU is present
device = torch.device("cuda:0" if params.cuda else sys.exit("gpu unavailable"))

if params.model == "cnn_text":
    if params.emb == 'w2v':
        model = models.CNN_Text(data_loader.weights_w2v, params)
def predict_from_workspace(workspace_dir, input_data):
    """Run the trained model in workspace_dir on input_data and return predictions."""
    global args, data_loader

    data_dir = workspace_dir
    model_dir = os.path.join(data_dir, "model")

    # Load the parameters
    args = parser.parse_args()
    trgt_json_path = os.path.join(model_dir, 'params.json')
    assert os.path.isfile(trgt_json_path), \
        "No json configuration file found at {}".format(trgt_json_path)
    params = utils.Params(trgt_json_path)
    params.data_dir = data_dir if data_dir else args.data_dir
    params.model_dir = model_dir if model_dir else args.model_dir

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)
    if params.cuda:
        torch.cuda.manual_seed(230)

    # Get the logger
    utils.set_logger(os.path.join(params.model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Creating the dataset...")

    # load data
    data_loader = DataLoader(params.data_dir, params)
    data = data_loader.load_data_for_predict(input_data)
    batch_sentences = data["predict"]["data"]

    # compute length of longest sentence in batch
    batch_max_len = max([len(s) for s in batch_sentences])

    # prepare a numpy array initialised with pad_ind
    # (no label array is needed at prediction time)
    batch_data = data_loader.pad_ind * np.ones(
        (len(batch_sentences), batch_max_len))

    # copy the data to the numpy array
    for j in range(len(batch_sentences)):
        cur_len = len(batch_sentences[j])
        batch_data[j][:cur_len] = batch_sentences[j]
    logging.info("- done.")

    # Define the model
    model = net.Net(params).cuda() if params.cuda else net.Net(params)

    logging.info("Starting prediction")

    # Reload weights from the saved file
    utils.load_checkpoint(
        os.path.join(params.model_dir, args.restore_file + '.pth.tar'), model)

    # Predict
    results = predict(model, batch_data)
    return results
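# NOTE: the predict helper called above is not shown. A minimal sketch of what
# the call site implies — convert the padded numpy batch to a LongTensor, run
# the net in eval mode, take the argmax label per token. The output shape
# comment is an assumption; the real helper may also map indices to tag names.
import torch

def predict(model, batch_data):
    """Run the tagger on a padded batch and return per-token label indices."""
    model.eval()
    with torch.no_grad():
        inputs = torch.from_numpy(batch_data).long()  # batch_data is a float ndarray above
        outputs = model(inputs)   # assumed shape: (batch, seq_len, num_tags)
        return outputs.argmax(dim=-1).tolist()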
def train_from_workspace(workspace_dir):
    global args, data_loader

    data_dir = workspace_dir
    model_dir = os.path.join(data_dir, "model")

    # Load the parameters from json file
    args = parser.parse_args()
    src_json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(src_json_path), \
        "No json configuration file found at {}".format(src_json_path)

    trgt_json_path = os.path.join(model_dir, 'params.json')
    if not os.path.exists(model_dir):
        print("Workspace Model Directory does not exist! Making directory {}".format(model_dir))
        os.mkdir(model_dir)
    else:
        print("Workspace Model Directory exists!")
    shutil.copyfile(src_json_path, trgt_json_path)

    params = utils.Params(trgt_json_path)
    params.data_dir = data_dir if data_dir else args.data_dir
    params.model_dir = model_dir if model_dir else args.model_dir

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)
    if params.cuda:
        torch.cuda.manual_seed(230)

    # Set the logger
    utils.set_logger(os.path.join(params.model_dir, 'train.log'))

    # Create the input data pipeline
    logging.info("Loading the datasets...")

    # load data
    data_loader = DataLoader(params.data_dir, params)
    data = data_loader.load_data_from_dir(['train', 'val'], params.data_dir)
    train_data = data['train']
    val_data = data['val']

    # specify the train and val dataset sizes
    params.train_size = train_data['size']
    params.val_size = val_data['size']
    logging.info("- done.")

    # Define the model and optimizer
    model = net.Net(params).cuda() if params.cuda else net.Net(params)
    optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

    # fetch loss function and metrics
    loss_fn = net.loss_fn
    metrics = net.metrics

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
    best_eval_acc = train_and_evaluate(model, train_data, val_data, optimizer,
                                       loss_fn, metrics, params,
                                       params.model_dir, args.restore_file)
    return best_eval_acc
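# NOTE: train_and_evaluate is not included in these excerpts. Given how it is
# called above (returns the best validation accuracy, optionally restoring a
# checkpoint first), its usual shape is sketched below. The per-epoch train()
# and evaluate() helpers, utils.save_checkpoint, the global data_loader, and
# the 'accuracy' metric key are all assumptions, not the project's actual API.
def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn,
                       metrics, params, model_dir, restore_file=None):
    """Train for params.num_epochs epochs and return the best validation accuracy."""
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        if os.path.isfile(restore_path):
            logging.info("Restoring parameters from {}".format(restore_path))
            utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0
    for epoch in range(params.num_epochs):
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # one full pass over the training set, then one over validation
        num_steps = (params.train_size + 1) // params.batch_size
        train(model, optimizer, loss_fn,
              data_loader.data_iterator(train_data, params, shuffle=True),
              metrics, params, num_steps)

        num_steps = (params.val_size + 1) // params.batch_size
        val_metrics = evaluate(model, loss_fn,
                               data_loader.data_iterator(val_data, params),
                               metrics, params, num_steps)

        # Keep the checkpoint with the best validation accuracy
        is_best = val_metrics['accuracy'] >= best_val_acc
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best, checkpoint=model_dir)
        if is_best:
            best_val_acc = val_metrics['accuracy']
    return best_val_acc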
import itertools as it

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame

from model.analize import Analize
from model.data_loader import DataLoader
from model.preprocess import Preprocess

X_trained = ['Milk', 'Frozen']
label = ['Customer']

data = DataLoader('./data.csv', ';', ',')
file = data.read_file()
prepared = Preprocess().prepare_input(X_trained, file)
model = Analize().fit(prepared)

rs = file['Milk'].to_frame(name='Milk')
rs2 = file['Frozen'].to_frame(name='Frozen')
lab = file['Customer '].to_frame(name='Customer')  # the source column name has a trailing space
rs3 = pd.DataFrame(model)
rs3.columns = ['Cluster']
# use GPU if available
params.cuda = torch.cuda.is_available()

# Set the random seed for reproducible experiments
torch.manual_seed(230)
if params.cuda:
    torch.cuda.manual_seed(230)

# Get the logger
utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Create the input data pipeline
logging.info("Creating the dataset...")

# load data
data_loader = DataLoader(args.data_dir, params)
data = data_loader.load_data(['test'], args.data_dir)
test_data = data['test']

# specify the test set size
params.test_size = test_data['size']
test_data_iterator = data_loader.data_iterator(test_data, params)
logging.info("- done.")

# Load embeddings
gen_emb = np.load(os.path.join(args.emb_dir, 'gen.npy'))
domain_emb = np.load(os.path.join(args.emb_dir, 'domain.npy'))

# Define the model
model = net.Net(params, gen_emb, domain_emb).cuda() if params.cuda \
    else net.Net(params, gen_emb, domain_emb)
import test_context

from model.data_loader import DataLoader

dl = DataLoader('data/xml/CoreOnly.xml')
dl.print_stats()
# use GPU if available
params.cuda = torch.cuda.is_available()

# Set the random seed for reproducible experiments
torch.manual_seed(230)
if params.cuda:
    torch.cuda.manual_seed(230)

# Set the logger
utils.set_logger(os.path.join(args.model_dir, 'train.log'))

# Create the input data pipeline
logging.info("Loading the datasets...")

# load data
data_loader = DataLoader(args.data_dir, params)
data = data_loader.load_data(['train', 'val'], args.data_dir)
train_data = data['train']
val_data = data['val']

# specify the train and val dataset sizes
params.train_size = train_data['size']
params.val_size = val_data['size']
logging.info("- done.")

# Define the model and optimizer
model = net.Net(params).cuda() if params.cuda else net.Net(params)
# Save the freshly initialised model so its graph can be inspected offline
torch.save(model, 'model_for_visualization.pth')
optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)
# use GPU if available
params.cuda = torch.cuda.is_available()

# Set the random seed for reproducible experiments
torch.manual_seed(230)
if params.cuda:
    torch.cuda.manual_seed(230)

# Get the logger
utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Create the input data pipeline
logging.info("Creating the dataset...")

# load data
print(params)
data_loader = DataLoader(params)
data = data_loader.load_data(args.test_data_path)

# specify the test set size
params.test_size = data_loader.get_dataset_size('all')
test_data_iterator = data_loader.data_iterator(split='all',
                                               batch_size=params.batch_size)
logging.info("- done.")

# Define the model
model = net.Model(params).cuda() if params.cuda else net.Model(params)
loss_fn = model.loss_fn
metrics = {
    'EM': model.exact_match_score,
    'f1': model.f1_score,
}