def run(model_path, test_path, train_path, settings, batch_size, buffer_size,
        device, model_info, full, confusion):
    model = BaseModel.load(model_path).to(device)
    if model_info:
        print(model)

    if hasattr(model, '_settings'):  # new models should all have _settings
        settings = model._settings
    elif settings:
        with utils.shutup():
            settings = settings_from_file(settings)
    else:
        with utils.shutup():
            settings = load_default_settings()

    # overwrite defaults
    settings.batch_size = batch_size
    settings.buffer_size = buffer_size
    settings.device = device

    trainset = None
    if train_path:
        trainset = Dataset(settings, Reader(settings, train_path),
                           model.label_encoder)

    testset = Dataset(settings, Reader(settings, *test_path),
                      model.label_encoder)

    for task in model.evaluate(testset, trainset).values():
        task.print_summary(full=full, confusion_matrix=confusion)
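# Hedged example call (all paths are placeholders, not real files): test_path
# is unpacked with * in the Reader call above, hence the tuple; passing None
# for settings falls through to load_default_settings().
run('model.tar', ('test.tsv',), None, None, 500, 100000, 'cpu',
    model_info=False, full=True, confusion=False)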
def compute_scores(
        trues: List[Tuple[str, str]],
        preds: List[Tuple[str, str]]) -> Dict[str, Union[float, int]]:
    """ Static method that replaces scorer.compute_scores

    Issue raised to make it a static method in the original code:
    https://github.com/emanjavacas/pie/issues/30
    """
    def format_score(score):
        return round(float(score), 4)

    # Trues and preds are tuples, which sklearn would treat as a multilabel
    # target, so we join each tuple into a single multiclass string
    trues, preds = zip(*[("##".join(true), "##".join(pred))
                         for true, pred in zip(trues, preds)])

    with utils.shutup():
        p = format_score(precision_score(trues, preds, average='macro'))
        r = format_score(recall_score(trues, preds, average='macro'))
        a = format_score(accuracy_score(trues, preds))

    return {'accuracy': a, 'precision': p, 'recall': r, 'support': len(trues)}
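# Hedged usage sketch for compute_scores: gold and predicted values are
# per-token tuples of task labels (the label sets here are illustrative,
# not taken from any real model).
gold = [('esse', 'VERB'), ('rex', 'NOUN')]
pred = [('esse', 'VERB'), ('rex', 'ADJ')]
scores = compute_scores(gold, pred)
# -> {'accuracy': 0.5, 'precision': 0.3333, 'recall': 0.3333, 'support': 2}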
def run(model_spec, device, batch_size, lower, beam_width, use_beam, tokenize):
    with utils.shutup():
        tagger = Tagger(device=device, batch_size=batch_size, lower=lower)

        for model, tasks in model_spec:
            tagger.add_model(model, *tasks)
            tasks = tasks or tagger.models[-1][0].label_encoder.tasks

    header = False
    for line in sys.stdin:
        if not line:
            continue

        if tokenize:
            line = simple_tokenizer(line, lower)
        else:
            line = line.split()

        preds, tasks = tagger.tag([line], [len(line)],
                                  use_beam=use_beam, beam_width=beam_width)

        if not header:
            print('\t'.join(['token'] + tasks))
            header = True

        preds = preds[0]  # unpack the single-sentence batch
        tokens, tags = zip(*preds)
        for token, token_tags in zip(tokens, tags):
            print('\t'.join([token] + list(token_tags)))
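# Hedged sketch of calling the tagger API directly instead of via stdin
# (the model path is a placeholder; the tag() call mirrors the signature
# used in run() above, with a batch of token lists plus their lengths).
with utils.shutup():
    tagger = Tagger(device='cpu', batch_size=50, lower=True)
    tagger.add_model('path/to/model.tar')
sent = simple_tokenizer('ibant obscuri sola sub nocte', True)
preds, tasks = tagger.tag([sent], [len(sent)], use_beam=False, beam_width=10)
for token, token_tags in preds[0]:
    print(token, *token_tags)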
def compute_scores(trues, preds):

    def format_score(score):
        return round(float(score), 4)

    with utils.shutup():
        p = format_score(precision_score(trues, preds, average='macro'))
        r = format_score(recall_score(trues, preds, average='macro'))
        a = format_score(accuracy_score(trues, preds))

    return {'accuracy': a, 'precision': p, 'recall': r, 'support': len(trues)}
def load(fpath):
    """ Load model from path
    """
    import pie

    with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:
        # check commit
        try:
            commit = utils.get_gzip_from_tar(tar, 'pie-commit.zip')
        except Exception:
            commit = None
        if (pie.__commit__ and commit) and pie.__commit__ != commit:
            logging.warning(
                ("Model {} was serialized with a previous "
                 "version of `pie`. This might result in issues. "
                 "Model commit is {}, whereas current `pie` commit is {}."
                 ).format(fpath, commit, pie.__commit__))

        # load label encoder
        le = MultiLabelEncoder.load_from_string(
            utils.get_gzip_from_tar(tar, 'label_encoder.zip'))

        # load tasks
        tasks = json.loads(utils.get_gzip_from_tar(tar, 'tasks.zip'))

        # load model parameters
        params = json.loads(utils.get_gzip_from_tar(tar, 'parameters.zip'))

        # instantiate model
        model_type = getattr(pie.models,
                             utils.get_gzip_from_tar(tar, 'class.zip'))
        with utils.shutup():
            model = model_type(le, tasks, *params['args'], **params['kwargs'])

        # load settings
        try:
            settings = Settings(
                json.loads(utils.get_gzip_from_tar(tar, 'settings.zip')))
            model._settings = settings
        except Exception:
            logging.warning(
                "Couldn't load settings for model {}!".format(fpath))

        # load state_dict
        with utils.tmpfile() as tmppath:
            tar.extract('state_dict.pt', path=tmppath)
            dictpath = os.path.join(tmppath, 'state_dict.pt')
            model.load_state_dict(torch.load(dictpath, map_location='cpu'))

    model.eval()

    return model
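# Hedged usage sketch (the path is a placeholder): callers such as the
# evaluate script above do BaseModel.load(...).to(device); the model comes
# back in eval mode, with its training settings attached when available.
model = BaseModel.load('path/to/model.tar').to('cpu')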
def load(fpath):
    """ Load model from path
    """
    import pie

    with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:
        # check commit
        try:
            commit = get_gzip_from_tar(tar, 'pie-commit.zip')
        except Exception:
            # no commit in file
            commit = None
        if (pie.__commit__ is not None and commit is not None
                and pie.__commit__ != commit):
            logging.warning(
                ("Model {} was serialized with a previous "
                 "version of `pie`. This might result in issues. "
                 "Model commit is {}, whereas current `pie` commit is {}."
                 ).format(fpath, commit, pie.__commit__))

        # load label encoder
        le = MultiLabelEncoder.load_from_string(
            get_gzip_from_tar(tar, 'label_encoder.zip'))

        # load model parameters
        params = json.loads(get_gzip_from_tar(tar, 'parameters.zip'))

        # instantiate model
        model_type = getattr(pie.models, get_gzip_from_tar(tar, 'class.zip'))
        with utils.shutup():
            model = model_type(le, *params['args'], **params['kwargs'])

        # (optional) load settings
        try:
            settings = Settings(
                json.loads(get_gzip_from_tar(tar, 'settings.zip')))
            model._settings = settings
        except Exception:
            # settings are optional in older serialized models
            pass

        # load state_dict; map_location avoids failures when the model
        # was saved on GPU but is being loaded on a CPU-only machine
        tmppath = '/tmp/{}'.format(str(uuid.uuid1()))
        tar.extract('state_dict.pt', path=tmppath)
        model.load_state_dict(
            torch.load(os.path.join(tmppath, 'state_dict.pt'),
                       map_location='cpu'))
        shutil.rmtree(tmppath)

    model.eval()

    return model
def get_pretrained_embeddings(reader, label_encoder, **kwargs):
    from pie import utils

    with utils.shutup():  # avoid pattern warning
        from gensim.models import Word2Vec
        word2vec = Word2Vec(reader.get_token_iterator(), **kwargs)

    weight = np.zeros((len(label_encoder.word), word2vec.wv.vector_size))
    found = 0
    for w, idx in label_encoder.word.table.items():
        try:
            weight[idx] = word2vec.wv[w]
            found += 1
        except KeyError:
            # reserved symbols are not in training sentences
            pass

    print("A total of {}/{} word embeddings were pretrained"
          .format(found, len(label_encoder.word)))

    return weight
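# Hedged follow-up sketch: seeding a PyTorch embedding layer with the matrix
# returned above (reader/label_encoder construction is elided; vector_size
# and min_count are standard gensim >= 4 Word2Vec parameters).
import torch
import torch.nn as nn

weight = get_pretrained_embeddings(reader, label_encoder,
                                   vector_size=100, min_count=1)
emb = nn.Embedding.from_pretrained(
    torch.tensor(weight, dtype=torch.float), freeze=False)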
def load(fpath):
    """ Load model from path
    """
    import tarte.modules.models

    with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:
        # load label encoder
        le = MultiEncoder.load(
            json.loads(utils.get_gzip_from_tar(tar, 'label_encoder.zip')))

        # load model parameters
        args, kwargs = json.loads(
            utils.get_gzip_from_tar(tar, 'parameters.zip'))

        # instantiate model
        model_type = getattr(tarte.modules.models,
                             utils.get_gzip_from_tar(tar, 'class.zip'))
        with utils.shutup():
            model = model_type(le, *args, **kwargs)

        # load settings
        try:
            settings = Settings(
                json.loads(utils.get_gzip_from_tar(tar, 'settings.zip')))
            model._settings = settings
        except Exception:
            logging.warning(
                "Couldn't load settings for model {}!".format(fpath))

        # load state_dict
        with utils.tmpfile() as tmppath:
            tar.extract('state_dict.pt', path=tmppath)
            dictpath = os.path.join(tmppath, 'state_dict.pt')
            model.load_state_dict(torch.load(dictpath, map_location='cpu'))

    model.eval()

    return model
import os
from typing import Optional, Dict, Generator, Type

from pie.utils import shutup

with shutup():
    from pie.tagger import Tagger
    from pie import utils

from .pipeline.formatters.proto import Formatter
from .pipeline.disambiguators.proto import Disambiguator
from .pipeline.iterators.proto import DataIterator
from .pipeline.postprocessor.proto import ProcessorPrototype


class ExtensibleTagger(Tagger):
    def __init__(self, device='cpu', batch_size=100, lower=False,
                 disambiguation=None):
        super(ExtensibleTagger, self).__init__(
            device=device,
            batch_size=batch_size,
            lower=lower
        )
        self.disambiguation: Optional[Disambiguator] = disambiguation

    def tag_file(self, fpath: str, iterator: DataIterator,
                 processor: ProcessorPrototype, no_tokenizer: bool = False):
        # Read the content of the file
        with open(fpath) as f:
            data = f.read()

        _, ext = os.path.splitext(fpath)
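# Hedged usage sketch for ExtensibleTagger (the model path is a placeholder;
# add_model is inherited from pie's Tagger, as seen in the tagging script
# above; real use would go through tag_file with an iterator and processor).
tagger = ExtensibleTagger(device='cpu', batch_size=64, lower=True)
tagger.add_model('path/to/model.tar')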
parser.add_argument('--settings', help="settings file used for training")
parser.add_argument('--batch_size', type=int, default=500)
parser.add_argument('--buffer_size', type=int, default=100000)
parser.add_argument('--device', default='cpu')
parser.add_argument('--model_info', action='store_true')
parser.add_argument('--full', action='store_true')
args = parser.parse_args()

model = BaseModel.load(args.model_path).to(args.device)
if args.model_info:
    print(model)

if hasattr(model, '_settings'):  # new models should all have _settings
    settings = model._settings
elif args.settings:
    with utils.shutup():
        settings = settings_from_file(args.settings)
else:
    with utils.shutup():
        settings = load_default_settings()

# overwrite defaults
settings.batch_size = args.batch_size
settings.buffer_size = args.buffer_size
settings.device = args.device

reader = Reader(settings, *args.test_path)
dataset = Dataset(settings, reader, model.label_encoder)
dataset = device_wrapper(list(dataset.batch_generator()), args.device)

for task in model.evaluate(dataset).values():
    task.print_summary(full=args.full)
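# Hedged invocation sketch (script name and paths are placeholders; the
# positional model_path/test_path arguments are implied by the args.* uses
# above, though their add_argument calls fall outside this excerpt):
#   python evaluate.py model.tar test.tsv --device cuda --batch_size 256 --full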