Example no. 1
def run(model_path, test_path, train_path, settings, batch_size, buffer_size,
        device, model_info, full, confusion):

    model = BaseModel.load(model_path).to(device)
    if model_info:
        print(model)

    if hasattr(model, '_settings'):  # new models should all have _settings
        settings = model._settings
    elif settings:
        with utils.shutup():
            settings = settings_from_file(settings)
    else:
        with utils.shutup():
            settings = load_default_settings()

    # overwrite defaults
    settings.batch_size = batch_size
    settings.buffer_size = buffer_size
    settings.device = device

    trainset = None
    if train_path:
        trainset = Dataset(settings, Reader(settings, train_path),
                           model.label_encoder)

    testset = Dataset(settings, Reader(settings, *test_path),
                      model.label_encoder)

    for task in model.evaluate(testset, trainset).values():
        task.print_summary(full=full, confusion_matrix=confusion)
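Several of these snippets lean on utils.shutup() to silence noisy imports and sklearn warnings. A minimal sketch of such a context manager, assuming all it needs to do is suppress warnings (the actual pie implementation may differ):

import warnings
from contextlib import contextmanager

@contextmanager
def shutup():
    # suppress any warnings emitted inside the with-block
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        yield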
Example no. 2
    def compute_scores(
            trues: List[Tuple[str, str]],
            preds: List[Tuple[str, str]]) -> Dict[str, Union[float, int]]:
        """ Static method that replaces scorer.compute_scores

        Issue raised to make it a static in the original code : https://github.com/emanjavacas/pie/issues/30
        """
        def format_score(score):
            return round(float(score), 4)

        # trues and preds are tuples of strings; joining each tuple into a
        # single "##"-delimited string lets sklearn score them as plain
        # multiclass labels

        trues, preds = zip(*[("##".join(true), "##".join(pred))
                             for true, pred in zip(trues, preds)])

        with utils.shutup():
            p = format_score(precision_score(trues, preds, average='macro'))
            r = format_score(recall_score(trues, preds, average='macro'))
            a = format_score(accuracy_score(trues, preds))

        return {
            'accuracy': a,
            'precision': p,
            'recall': r,
            'support': len(trues)
        }
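To make the joining step concrete, here is what the transformation produces on a pair of hypothetical (lemma, pos) annotations:

trues = [('canis', 'NOUN'), ('amare', 'VERB')]
preds = [('canis', 'NOUN'), ('amare', 'ADJ')]
trues, preds = zip(*[('##'.join(true), '##'.join(pred))
                     for true, pred in zip(trues, preds)])
# trues == ('canis##NOUN', 'amare##VERB')
# preds == ('canis##NOUN', 'amare##ADJ')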
Example no. 3
def run(model_spec, device, batch_size, lower, beam_width, use_beam, tokenize):
    with utils.shutup():
        tagger = Tagger(device=device, batch_size=batch_size, lower=lower)

        for model, tasks in model_spec:
            tagger.add_model(model, *tasks)
            # default to every task known to the model's label encoder
            tasks = tasks or tagger.models[-1][0].label_encoder.tasks

    header = False
    for line in sys.stdin:
        if not line.strip():  # stdin lines keep their newline; skip blanks
            continue

        if tokenize:
            line = simple_tokenizer(line, lower)
        else:
            line = line.split()

        preds, tasks = tagger.tag([line], [len(line)],
                                  use_beam=use_beam,
                                  beam_width=beam_width)

        if not header:
            print('\t'.join(['token'] + tasks))
            header = True

        preds = preds[0]  # unpack the single-sentence batch
        tokens, tags = zip(*preds)
        for token, token_tags in zip(tokens, tags):
            print('\t'.join([token] + list(token_tags)))
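The --tokenize branch relies on simple_tokenizer. A rough stand-in, assuming it only needs to lowercase on demand and split punctuation off as separate tokens (pie's real tokenizer may do more):

import re

def simple_tokenizer(line, lower=False):
    # split on whitespace, treating each punctuation mark as its own token
    if lower:
        line = line.lower()
    return re.findall(r'\w+|[^\w\s]', line, re.UNICODE)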
Example no. 4
def compute_scores(trues, preds):
    def format_score(score):
        return round(float(score), 4)

    with utils.shutup():
        p = format_score(precision_score(trues, preds, average='macro'))
        r = format_score(recall_score(trues, preds, average='macro'))
        a = format_score(accuracy_score(trues, preds))

    return {'accuracy': a, 'precision': p, 'recall': r, 'support': len(trues)}
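A toy invocation, using hypothetical POS labels; the macro averages follow from sklearn scoring over the three labels seen across trues and preds:

trues = ['NOUN', 'VERB']
preds = ['NOUN', 'ADJ']
compute_scores(trues, preds)
# -> {'accuracy': 0.5, 'precision': 0.3333, 'recall': 0.3333, 'support': 2}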
Example no. 5
    def load(fpath):
        """
        Load model from path
        """
        import pie

        with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:
            # check commit
            try:
                commit = utils.get_gzip_from_tar(tar, 'pie-commit.zip')
            except Exception:
                commit = None
            if (pie.__commit__ and commit) and pie.__commit__ != commit:
                logging.warning(
                    ("Model {} was serialized with a previous "
                     "version of `pie`. This might result in issues. "
                     "Model commit is {}, whereas current `pie` commit is {}."
                     ).format(fpath, commit, pie.__commit__))

            # load label encoder
            le = MultiLabelEncoder.load_from_string(
                utils.get_gzip_from_tar(tar, 'label_encoder.zip'))

            # load tasks
            tasks = json.loads(utils.get_gzip_from_tar(tar, 'tasks.zip'))

            # load model parameters
            params = json.loads(utils.get_gzip_from_tar(tar, 'parameters.zip'))

            # instantiate model
            model_type = getattr(pie.models,
                                 utils.get_gzip_from_tar(tar, 'class.zip'))
            with utils.shutup():
                model = model_type(le, tasks, *params['args'],
                                   **params['kwargs'])

            # load settings
            try:
                settings = Settings(
                    json.loads(utils.get_gzip_from_tar(tar, 'settings.zip')))
                model._settings = settings
            except Exception:
                logging.warning(
                    "Couldn't load settings for model {}!".format(fpath))

            # load state_dict
            with utils.tmpfile() as tmppath:
                tar.extract('state_dict.pt', path=tmppath)
                dictpath = os.path.join(tmppath, 'state_dict.pt')
                model.load_state_dict(torch.load(dictpath, map_location='cpu'))

        model.eval()

        return model
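The loader pulls every metadata member through utils.get_gzip_from_tar. A plausible implementation, assuming each member is a gzip-compressed UTF-8 text payload (an assumption, not pie's verbatim code):

import gzip

def get_gzip_from_tar(tar, fpath):
    # extract the member, decompress it and return it as stripped text
    with tar.extractfile(fpath) as member:
        return gzip.decompress(member.read()).decode('utf-8').strip()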
Example no. 6
    def load(fpath):
        """
        Load model from path
        """
        import pie

        with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:
            # check commit
            try:
                commit = get_gzip_from_tar(tar, 'pie-commit.zip')
            except Exception:
                # no commit in file
                commit = None
            if pie.__commit__ is not None and commit is not None \
               and pie.__commit__ != commit:
                logging.warning(
                    ("Model {} was serialized with a previous "
                     "version of `pie`. This might result in issues. "
                     "Model commit is {}, whereas current `pie` commit is {}."
                     ).format(fpath, commit, pie.__commit__))
            # load label encoder
            le = MultiLabelEncoder.load_from_string(
                get_gzip_from_tar(tar, 'label_encoder.zip'))
            # load model parameters
            params = json.loads(get_gzip_from_tar(tar, 'parameters.zip'))
            # instantiate model
            model_type = getattr(pie.models,
                                 get_gzip_from_tar(tar, 'class.zip'))
            with utils.shutup():
                model = model_type(le, *params['args'], **params['kwargs'])
            # (optional) load settings
            try:
                settings = Settings(
                    json.loads(get_gzip_from_tar(tar, 'settings.zip')))
                model._settings = settings
            except Exception:
                # settings are optional in older serialized models
                pass
            # load state_dict
            tmppath = '/tmp/{}'.format(str(uuid.uuid1()))
            tar.extract('state_dict.pt', path=tmppath)
            model.load_state_dict(
                torch.load(os.path.join(tmppath, 'state_dict.pt'),
                           map_location='cpu'))
            shutil.rmtree(tmppath)

        model.eval()

        return model
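Example no. 5 handles the temporary extraction directory with utils.tmpfile, whereas this older variant builds a /tmp path by hand. A sketch of such a helper, assuming it only needs to hand out a fresh directory and delete it on exit:

import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def tmpfile():
    # yield a throw-away directory, removing it once the block exits
    path = tempfile.mkdtemp()
    try:
        yield path
    finally:
        shutil.rmtree(path)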
Example no. 7
def get_pretrained_embeddings(reader, label_encoder, **kwargs):
    from pie import utils
    with utils.shutup():        # avoid pattern warning
        from gensim.models import Word2Vec

    word2vec = Word2Vec(reader.get_token_iterator(), **kwargs)
    weight = np.zeros((len(label_encoder.word), word2vec.wv.vector_size))

    found = 0
    for w, idx in label_encoder.word.table.items():
        try:
            weight[idx] = word2vec.wv[w]
            found += 1
        except KeyError:  # reserved symbols are not in training sentences
            pass

    print("A total of {}/{} word embeddings were pretrained"
          .format(found, len(label_encoder.word)))

    return weight
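A hypothetical way to plug the returned matrix into a torch embedding layer (the vector_size and min_count keywords assume gensim 4; gensim 3 called the former size):

import torch

weight = get_pretrained_embeddings(reader, label_encoder,
                                   vector_size=100, min_count=1)
embedding = torch.nn.Embedding.from_pretrained(
    torch.tensor(weight, dtype=torch.float), freeze=False)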
Example no. 8
    def load(fpath):
        """
        Load model from path
        """
        import tarte.modules.models

        with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:

            # load label encoder
            le = MultiEncoder.load(
                json.loads(utils.get_gzip_from_tar(tar, 'label_encoder.zip')))

            # load model parameters
            args, kwargs = json.loads(
                utils.get_gzip_from_tar(tar, 'parameters.zip'))

            # instantiate model
            model_type = getattr(tarte.modules.models,
                                 utils.get_gzip_from_tar(tar, 'class.zip'))
            with utils.shutup():
                model = model_type(le, *args, **kwargs)

            # load settings
            try:
                settings = Settings(
                    json.loads(utils.get_gzip_from_tar(tar, 'settings.zip')))
                model._settings = settings
            except Exception:
                logging.warning(
                    "Couldn't load settings for model {}!".format(fpath))

            # load state_dict
            with utils.tmpfile() as tmppath:
                tar.extract('state_dict.pt', path=tmppath)
                dictpath = os.path.join(tmppath, 'state_dict.pt')
                model.load_state_dict(torch.load(dictpath, map_location='cpu'))

        model.eval()

        return model
Example no. 9
import os
from typing import Optional, Dict, Generator, Type

from pie.utils import shutup

with shutup():
    from pie.tagger import Tagger
    from pie import utils

from .pipeline.formatters.proto import Formatter
from .pipeline.disambiguators.proto import Disambiguator
from .pipeline.iterators.proto import DataIterator
from .pipeline.postprocessor.proto import ProcessorPrototype


class ExtensibleTagger(Tagger):
    def __init__(self, device='cpu', batch_size=100, lower=False, disambiguation=None):
        super(ExtensibleTagger, self).__init__(
            device=device,
            batch_size=batch_size,
            lower=lower
        )
        self.disambiguation: Optional[Disambiguator] = disambiguation

    def tag_file(self, fpath: str, iterator: DataIterator, processor: ProcessorPrototype, no_tokenizer: bool = False):
        # Read content of the file
        with open(fpath) as f:
            data = f.read()

        _, ext = os.path.splitext(fpath)
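The excerpt cuts off before the format dispatch, but setting the tagger up follows the pie Tagger API; a hypothetical setup (the model path and task names below are placeholders):

tagger = ExtensibleTagger(device='cpu', batch_size=64, lower=True)
tagger.add_model('path/to/model.tar', 'lemma', 'pos')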
Example no. 10
    parser.add_argument('--settings', help="settings file used for training")
    parser.add_argument('--batch_size', type=int, default=500)
    parser.add_argument('--buffer_size', type=int, default=100000)
    parser.add_argument('--device', default='cpu')
    parser.add_argument('--model_info', action='store_true')
    parser.add_argument('--full', action='store_true')
    args = parser.parse_args()

    model = BaseModel.load(args.model_path).to(args.device)
    if args.model_info:
        print(model)

    if hasattr(model, '_settings'):  # new models should all have _settings
        settings = model._settings
    elif args.settings:
        with utils.shutup():
            settings = settings_from_file(args.settings)
    else:
        with utils.shutup():
            settings = load_default_settings()

    # overwrite defaults
    settings.batch_size = args.batch_size
    settings.buffer_size = args.buffer_size
    settings.device = args.device

    reader = Reader(settings, *args.test_path)
    dataset = Dataset(settings, reader, model.label_encoder)
    dataset = device_wrapper(list(dataset.batch_generator()), args.device)

    for task in model.evaluate(dataset).values():
        task.print_summary(full=args.full)
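device_wrapper is not shown here; a rough sketch of what it might do, assuming batches are nested lists or tuples of tensors (an assumption, not pie's verbatim helper):

import torch

def device_wrapper(batches, device):
    # recursively move every tensor inside each batch to the target device
    def to_device(item):
        if isinstance(item, torch.Tensor):
            return item.to(device)
        if isinstance(item, (list, tuple)):
            return type(item)(to_device(elem) for elem in item)
        return item
    return [to_device(batch) for batch in batches]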