def eval(self, json_path, batch_size=100, lowercase=True, ignore_cpos=False,
         cui_filter=None, score_average='weighted', replace_center=None):
    r''' Evaluate the currently loaded model on a MedCATtrainer JSON export.

    Args:
        json_path (str): Path to the MedCATtrainer export to evaluate on.
        batch_size (int): Number of examples per evaluation batch.
        lowercase (bool): Lowercase the text during preparation.
        ignore_cpos (bool): Ignore the center-position feature during evaluation.
        cui_filter (set, optional): If set, only annotations with these CUIs are used.
        score_average (str): Averaging mode passed through to the scorer
            (e.g. 'weighted' — see sklearn metric averaging options).
        replace_center (str, optional): If set, the center token is replaced
            with this value during preparation.

    Returns:
        The result dictionary produced by `eval_network` (F1/P/R etc.).

    Raises:
        Exception: If `self.category_name` is not present in the export.
    '''
    # Use a context manager so the file handle is always closed
    # (the previous `json.load(open(...))` leaked the handle).
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Prepare the data
    data = prepare_from_json(data, self.cntx_left, self.cntx_right, self.tokenizer,
                             lowercase=lowercase, cui_filter=cui_filter,
                             replace_center=replace_center)

    # Check is the name there
    if self.category_name not in data:
        raise Exception("The category name does not exist in this json file.")

    data = data[self.category_name]

    # We already have everything, just get the data — reuse the category
    # value encoding learned during training so label ids stay consistent.
    data, _ = encode_category_values(data, vals=self.category_values)

    # Convert data tkns to ids
    data = tkns_to_ids(data, self.tokenizer)

    # Run evaluation
    result = eval_network(self.model, data,
                          max_seq_len=(self.cntx_left + self.cntx_right + 1),
                          pad_id=self.pad_id, batch_size=batch_size,
                          device=self.device, ignore_cpos=ignore_cpos,
                          score_average=score_average)
    return result
def train(self, json_path, category_name=None, model_name='BERT_GRU', Bio_BERT_PATH=None,
          lr=0.01, test_size=0.1, batch_size=100, nepochs=20, lowercase=True,
          class_weights=None, cv=0, ignore_cpos=False, model_config=None,
          tui_filter=None, fine_tune=False, auto_save_model=True,
          score_average='weighted', replace_center=None, seed=11):
    r''' Train (or fine-tune) a meta-annotation model from a MedCATtrainer export.

    Args:
        json_path (str): Path to the MedCATtrainer export to train on.
        category_name (str, optional): Overrides `self.category_name` if given.
        model_name (str): Which architecture to build ('lstm' or 'bert_gru',
            matched case-insensitively).
        Bio_BERT_PATH (str, optional): Path to the BERT weights for 'bert_gru'.
        lr (float): Learning rate.
        test_size (float): Fraction of data held out for testing.
        batch_size (int): Training batch size.
        nepochs (int): Number of training epochs.
        lowercase (bool): Lowercase the text during preparation.
        class_weights (list, optional): Per-class loss weights.
        cv (int): If > 0, run `cv` repeated train/eval rounds and average scores
            (mainly for testing); if 0, train once.
        ignore_cpos (bool): Ignore the center-position feature.
        model_config (dict, optional): Architecture hyperparameters
            (bid, num_layers, input_size, hidden_size, dropout).
        tui_filter (set, optional): If set, only annotations with these TUIs are used.
        fine_tune (bool): Reuse the existing model/label encoding instead of
            building a new one.
        auto_save_model (bool): Save the best model during training.
        score_average (str): Averaging mode for the scorer (e.g. 'weighted').
        replace_center (str, optional): Replace the center token with this value.
        seed (int): Random seed for reproducibility.

    Returns:
        dict: {'f1', 'p', 'r', 'cls_report'} with (averaged) scores.

    Raises:
        Exception: If the category name is not present in the export.
        ValueError: If `cv` is negative.
    '''
    # Avoid the mutable-default-argument pitfall: `{}` as a default is shared
    # across calls; normalize None to a fresh dict here instead.
    if model_config is None:
        model_config = {}
    if cv < 0:
        raise ValueError("cv must be >= 0, got {}".format(cv))

    set_all_seeds(seed)

    # Context manager so the file handle is always closed.
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Create directories if they don't exist
    if not os.path.exists(self.save_dir):
        os.makedirs(self.save_dir)

    # Prepare the data
    data = prepare_from_json(data, self.cntx_left, self.cntx_right, self.tokenizer,
                             lowercase=lowercase, tui_filter=tui_filter,
                             replace_center=replace_center)

    if category_name is not None:
        self.category_name = category_name

    # Check is the name there
    if self.category_name not in data:
        raise Exception(
            "The category name does not exist in this json file. You've provided '{}', while the possible options are: {}"
            .format(self.category_name, " | ".join(list(data.keys()))))

    data = data[self.category_name]

    if not fine_tune:
        # Encode the category values
        data, self.category_values = encode_category_values(data)
        self.i_category_values = {v: k for k, v in self.category_values.items()}
    else:
        # We already have everything, just get the data
        data, _ = encode_category_values(data, vals=self.category_values)

    # Convert data tkns to ids
    data = tkns_to_ids(data, self.tokenizer)

    # Normalize once for comparison so the documented default 'BERT_GRU'
    # actually matches; `model_name` itself is left untouched because it is
    # also forwarded to `self.load_model` below.
    _model_key = model_name.lower()

    if not fine_tune:
        if _model_key == 'lstm':
            from medcat.utils.models import LSTM
            nclasses = len(self.category_values)
            bid = model_config.get("bid", True)
            num_layers = model_config.get("num_layers", 2)
            input_size = model_config.get("input_size", 300)
            hidden_size = model_config.get("hidden_size", 300)
            dropout = model_config.get("dropout", 0.5)
            self.model = LSTM(self.embeddings, self.pad_id, nclasses=nclasses,
                              bid=bid, num_layers=num_layers,
                              input_size=input_size, hidden_size=hidden_size,
                              dropout=dropout)
        elif _model_key == 'bert_gru':
            from medcat.utils.models import BERT_GRU
            nclasses = len(self.category_values)
            bid = model_config.get("bid", True)
            num_layers = model_config.get("num_layers", 5)
            input_size = model_config.get("input_size", 768)
            hidden_size = model_config.get("hidden_size", 768)
            dropout = model_config.get("dropout", 0.5)
            self.model = BERT_GRU(Bio_BERT_PATH, nclasses=nclasses, bid=bid,
                                  num_layers=num_layers, input_size=input_size,
                                  hidden_size=hidden_size, dropout=dropout)
        else:
            raise ValueError("Unknown model_name: {}".format(model_name))

    if cv == 0:
        (f1, p, r, cls_report) = train_network(
            self.model, data,
            max_seq_len=(self.cntx_left + self.cntx_right + 1), lr=lr,
            test_size=test_size, pad_id=self.pad_id, batch_size=batch_size,
            nepochs=nepochs, device=self.device, class_weights=class_weights,
            ignore_cpos=ignore_cpos, save_dir=self.save_dir,
            auto_save_model=auto_save_model, score_average=score_average)
    else:
        # Mainly for testing, not really used in a normal workflow
        f1s = []
        ps = []
        rs = []
        cls_reports = []
        for _ in range(cv):
            # Reset the model before each round so rounds are independent.
            if fine_tune:
                self.load_model(model=model_name)
            else:
                # NOTE(review): only the 'lstm' model is re-created here —
                # 'bert_gru' keeps its weights between CV rounds; confirm
                # whether that is intentional.
                if _model_key == 'lstm':
                    from medcat.utils.models import LSTM
                    nclasses = len(self.category_values)
                    self.model = LSTM(self.embeddings, self.pad_id,
                                      nclasses=nclasses)

            (_f1, _p, _r, _cls_report) = train_network(
                self.model, data,
                max_seq_len=(self.cntx_left + self.cntx_right + 1), lr=lr,
                test_size=test_size, pad_id=self.pad_id, batch_size=batch_size,
                nepochs=nepochs, device=self.device,
                class_weights=class_weights, ignore_cpos=ignore_cpos,
                save_dir=self.save_dir, score_average=score_average)
            f1s.append(_f1)
            ps.append(_p)
            rs.append(_r)
            cls_reports.append(_cls_report)

        f1 = np.average(f1s)
        p = np.average(ps)
        r = np.average(rs)

        # Average the per-label classification reports across CV rounds.
        cls_report = {}
        for label, scores in cls_reports[0].items():
            if isinstance(scores, dict):
                cls_report[label] = {
                    score: sum(cr[label][score] for cr in cls_reports) / len(cls_reports)
                    for score in scores
                }
            else:
                # Scalar entries (e.g. overall accuracy) were previously
                # replaced by an empty dict; average them instead.
                cls_report[label] = sum(cr[label] for cr in cls_reports) / len(cls_reports)

    print("Best/Average scores: F1: {}, P: {}, R: {}".format(f1, p, r))
    return {'f1': f1, 'p': p, 'r': r, 'cls_report': cls_report}
def train(self, json_path, category_name=None, model_name='lstm', lr=0.01,
          test_size=0.1, batch_size=100, nepochs=20, lowercase=True,
          class_weights=None, cv=0, ignore_cpos=False, model_config=None,
          tui_filter=None, fine_tune=False, auto_save_model=True):
    r''' Train (or fine-tune) a meta-annotation model from a MedCATtrainer export.

    Args:
        json_path (str): Path to the MedCATtrainer export to train on.
        category_name (str, optional): Overrides `self.category_name` if given.
        model_name (str): Which architecture to build (only 'lstm' supported here).
        lr (float): Learning rate.
        test_size (float): Fraction of data held out for testing.
        batch_size (int): Training batch size.
        nepochs (int): Number of training epochs.
        lowercase (bool): Lowercase the text during preparation.
        class_weights (list, optional): Per-class loss weights.
        cv (int): If > 0, run `cv` repeated train/eval rounds and average scores
            (mainly for testing); if 0, train once.
        ignore_cpos (bool): Ignore the center-position feature.
        model_config (dict, optional): Architecture hyperparameters
            (bid, num_layers, input_size, hidden_size, dropout).
        tui_filter (set, optional): If set, only annotations with these TUIs are used.
        fine_tune (bool): Reuse the existing model/label encoding instead of
            building a new one.
        auto_save_model (bool): Save the best model during training.

    Returns:
        dict: {'f1', 'p', 'r'} with (averaged) scores.

    Raises:
        Exception: If the category name is not present in the export.
        ValueError: If `cv` is negative or `model_name` is unknown.
    '''
    # Avoid the mutable-default-argument pitfall: `{}` as a default is shared
    # across calls; normalize None to a fresh dict here instead.
    if model_config is None:
        model_config = {}
    if cv < 0:
        raise ValueError("cv must be >= 0, got {}".format(cv))

    # Context manager so the file handle is always closed.
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Create directories if they don't exist
    if not os.path.exists(self.save_dir):
        os.makedirs(self.save_dir)

    # Prepare the data
    data = prepare_from_json(data, self.cntx_left, self.cntx_right,
                             self.tokenizer, lowercase=lowercase,
                             tui_filter=tui_filter)

    if category_name is not None:
        self.category_name = category_name

    # Check is the name there
    if self.category_name not in data:
        raise Exception("The category name does not exist in this json file.")

    data = data[self.category_name]

    if not fine_tune:
        # Encode the category values
        data, self.category_values = encode_category_values(data)
        self.i_category_values = {v: k for k, v in self.category_values.items()}
    else:
        # We already have everything, just get the data
        data, _ = encode_category_values(data, vals=self.category_values)

    # Convert data tkns to ids
    data = tkns_to_ids(data, self.tokenizer)

    if not fine_tune:
        if model_name == 'lstm':
            from medcat.utils.models import LSTM
            nclasses = len(self.category_values)
            bid = model_config.get("bid", True)
            num_layers = model_config.get("num_layers", 2)
            input_size = model_config.get("input_size", 300)
            hidden_size = model_config.get("hidden_size", 300)
            dropout = model_config.get("dropout", 0.5)
            self.model = LSTM(self.embeddings, self.pad_id, nclasses=nclasses,
                              bid=bid, num_layers=num_layers,
                              input_size=input_size, hidden_size=hidden_size,
                              dropout=dropout)
        else:
            # Previously an unknown name fell through and training crashed
            # later (or reused a stale self.model); fail fast instead.
            raise ValueError("Unknown model_name: {}".format(model_name))

    if cv == 0:
        (f1, p, r) = train_network(
            self.model, data,
            max_seq_len=(self.cntx_left + self.cntx_right + 1), lr=lr,
            test_size=test_size, pad_id=self.pad_id, batch_size=batch_size,
            nepochs=nepochs, device=self.device, class_weights=class_weights,
            ignore_cpos=ignore_cpos, save_dir=self.save_dir,
            auto_save_model=auto_save_model)
    else:
        # Mainly for testing, not really used in a normal workflow
        f1s = []
        ps = []
        rs = []
        for _ in range(cv):
            # Reset the model before each round so rounds are independent.
            if fine_tune:
                self.load_model(model=model_name)
            else:
                if model_name == 'lstm':
                    from medcat.utils.models import LSTM
                    nclasses = len(self.category_values)
                    self.model = LSTM(self.embeddings, self.pad_id,
                                      nclasses=nclasses)

            (_f1, _p, _r) = train_network(
                self.model, data,
                max_seq_len=(self.cntx_left + self.cntx_right + 1), lr=lr,
                test_size=test_size, pad_id=self.pad_id, batch_size=batch_size,
                nepochs=nepochs, device=self.device,
                class_weights=class_weights, ignore_cpos=ignore_cpos,
                save_dir=self.save_dir)
            f1s.append(_f1)
            ps.append(_p)
            rs.append(_r)

        f1 = np.average(f1s)
        p = np.average(ps)
        r = np.average(rs)

    print("Best/Average scores: F1: {}, P: {}, R: {}".format(f1, p, r))
    return {'f1': f1, 'p': p, 'r': r}
def train(self, json_path, category_name, model_name='lstm', lr=0.01,
          test_size=0.1, batch_size=100, nepochs=20, device='cpu',
          lowercase=True, class_weights=None, cv=0):
    r''' Train a meta-annotation model from a MedCATtrainer export.

    Args:
        json_path (str): Path to the MedCATtrainer export to train on.
        category_name (str): Name of the meta-annotation category to train for.
        model_name (str): Which architecture to build (only 'lstm' supported).
        lr (float): Learning rate.
        test_size (float): Fraction of data held out for testing.
        batch_size (int): Training batch size.
        nepochs (int): Number of training epochs.
        device (str): Torch device to train on (e.g. 'cpu' or 'cuda').
        lowercase (bool): Lowercase the text during preparation.
        class_weights (list, optional): Per-class loss weights.
        cv (int): If > 0, run `cv` repeated train/eval rounds and average scores
            (mainly for testing); if 0, train once.

    Side effects:
        Sets `self.category_name`, `self.category_values`,
        `self.i_category_values` and `self.model`. Returns None.

    Raises:
        Exception: If `category_name` is not present in the export.
        ValueError: If `cv` is negative or `model_name` is unknown.
    '''
    if cv < 0:
        raise ValueError("cv must be >= 0, got {}".format(cv))

    # Context manager so the file handle is always closed.
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Prepare the data
    data = prepare_from_json(data, self.cntx_left, self.cntx_right,
                             self.tokenizer, lowercase=lowercase)

    # Check is the name there
    if category_name not in data:
        raise Exception("The category name does not exist in this json file")

    data = data[category_name]

    # Encode the category values
    self.category_name = category_name
    data, self.category_values = encode_category_values(data)
    self.i_category_values = {v: k for k, v in self.category_values.items()}

    # Convert data tkns to ids
    data = tkns_to_ids(data, self.tokenizer)

    if model_name == 'lstm':
        from medcat.utils.models import LSTM
        nclasses = len(self.category_values)
        model = LSTM(self.embeddings, self.pad_id, nclasses=nclasses)
    else:
        # Previously an unknown name left `model` unbound and the call to
        # train_network raised a confusing NameError; fail fast instead.
        raise ValueError("Unknown model_name: {}".format(model_name))

    if cv == 0:
        (f1, p, r) = train_network(
            model, data,
            max_seq_len=(self.cntx_left + self.cntx_right + 1), lr=lr,
            test_size=test_size, pad_id=self.pad_id, batch_size=batch_size,
            nepochs=nepochs, device=device, class_weights=class_weights)
    else:
        # Mainly for testing, not really used in a normal workflow
        f1s = []
        ps = []
        rs = []
        for _ in range(cv):
            # Reset the model before each round so rounds are independent.
            if model_name == 'lstm':
                from medcat.utils.models import LSTM
                nclasses = len(self.category_values)
                model = LSTM(self.embeddings, self.pad_id, nclasses=nclasses)

            (_f1, _p, _r) = train_network(
                model, data,
                max_seq_len=(self.cntx_left + self.cntx_right + 1), lr=lr,
                test_size=test_size, pad_id=self.pad_id,
                batch_size=batch_size, nepochs=nepochs, device=device,
                class_weights=class_weights)
            f1s.append(_f1)
            ps.append(_p)
            rs.append(_r)

        f1 = np.average(f1s)
        p = np.average(ps)
        r = np.average(rs)

    print("Best/Average scores: F1: {}, P: {}, R: {}".format(f1, p, r))
    self.model = model