Exemple #1
0
def _test_conversion(settings, level='token'):
    """
    Round-trip check for the 'lemma' task.

    Every encoded lemma target yielded by the dataset is turned back into
    string form with the task encoder's `stringify`, passed (together with
    the raw input token) through the preprocessor's `inverse_transform`,
    and asserted to equal the raw lemma.

    settings: settings object used to build the reader, encoder and dataset.
    level: when 'token', the per-sentence outputs of `stringify` are
        flattened into a single list before comparison.
    """
    reader = Reader(settings, settings.input_path)
    encoders = MultiLabelEncoder.from_settings(settings)
    encoders.fit_reader(reader)
    dataset = Dataset(settings, reader, encoders)
    lemma_encoder = encoders.tasks['lemma']

    batches = dataset.batch_generator(return_raw=True)
    for (_, enc_tasks), (raw_inp, raw_tasks) in batches:
        # decode the encoded targets back to strings
        target, lengths = enc_tasks['lemma']
        decoded = []
        for seq, length in zip(target.t().tolist(), lengths.tolist()):
            decoded.append(lemma_encoder.stringify(seq, length))
        if level == 'token':
            flattened = []
            for sent in decoded:
                flattened.extend(sent)
            decoded = flattened

        # raw input tokens, flattened over sentences
        raw_tokens = []
        for sent in raw_inp:
            raw_tokens.extend(sent)

        # raw (expected) lemmas, flattened over sentences
        expected = []
        for sent_tasks in raw_tasks:
            expected.extend(sent_tasks['lemma'])

        # the inverse transform must give back the raw lemma
        for pred, token, true in zip(decoded, raw_tokens, expected):
            rec = lemma_encoder.preprocessor_fn.inverse_transform(pred, token)
            assert rec == true, (pred, token, true, rec)
Exemple #2
0
 def setUp(self):
     """
     Build a dataset with batch size 1 and record the value returned by
     `fit` plus the batch count derived from it.
     """
     config = settings_from_file(testpath)
     config['batch_size'] = 1
     corpus = Reader(config, config.input_path)
     encoder = MultiLabelEncoder.from_settings(config)
     # only the second item of each pair from `readsents` is fed to `fit`
     fitted = encoder.fit(sent for _, sent in corpus.readsents())
     self.insts = fitted
     self.num_batches = fitted // config.batch_size
     self.data = Dataset(config, corpus, encoder)
Exemple #3
0
    def load(fpath):
        """
        Load model from path

        Reads a tar archive (the path is passed through
        `utils.ensure_ext(fpath, 'tar')`) and rebuilds the model from its
        members: the label encoder, the task definitions, the constructor
        arguments, the model class name and the `state_dict.pt` weights.
        The weights are loaded with `map_location='cpu'`.

        A stored `pie` commit that differs from the running one only
        triggers a warning; settings that cannot be read only trigger a
        warning as well.

        Returns the model, switched to eval mode.
        """
        import pie

        with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:
            # check commit
            try:
                commit = utils.get_gzip_from_tar(tar, 'pie-commit.zip')
            except Exception:
                # treat an unreadable commit member as "no commit recorded"
                commit = None
            if (pie.__commit__ and commit) and pie.__commit__ != commit:
                # `logging.warn` is a deprecated alias of `logging.warning`
                logging.warning(
                    ("Model {} was serialized with a previous "
                     "version of `pie`. This might result in issues. "
                     "Model commit is {}, whereas current `pie` commit is {}."
                     ).format(fpath, commit, pie.__commit__))

            # load label encoder
            le = MultiLabelEncoder.load_from_string(
                utils.get_gzip_from_tar(tar, 'label_encoder.zip'))

            # load tasks
            tasks = json.loads(utils.get_gzip_from_tar(tar, 'tasks.zip'))

            # load model parameters
            params = json.loads(utils.get_gzip_from_tar(tar, 'parameters.zip'))

            # instantiate model
            model_type = getattr(pie.models,
                                 utils.get_gzip_from_tar(tar, 'class.zip'))
            with utils.shutup():
                model = model_type(le, tasks, *params['args'],
                                   **params['kwargs'])

            # load settings (best effort: a failure is only reported)
            try:
                settings = Settings(
                    json.loads(utils.get_gzip_from_tar(tar, 'settings.zip')))
                model._settings = settings
            except Exception:
                logging.warning(
                    "Couldn't load settings for model {}!".format(fpath))

            # load state_dict
            with utils.tmpfile() as tmppath:
                tar.extract('state_dict.pt', path=tmppath)
                dictpath = os.path.join(tmppath, 'state_dict.pt')
                model.load_state_dict(torch.load(dictpath, map_location='cpu'))

        model.eval()

        return model
Exemple #4
0
    def test_serialization(self):
        """
        Save the dataset's label encoder to disk, load it back and check
        that the word, char and per-task encoders compare equal.
        """
        import os
        import tempfile

        le = self.data.label_encoder
        # A private temporary directory (instead of a fixed file in /tmp)
        # avoids clashes between concurrent runs and is removed afterwards.
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'encoder.json')
            le.save(path)
            le2 = MultiLabelEncoder.load_from_file(path)

        self.assertEqual(len(le.tasks), len(le2.tasks),
                         "Unequal number of Modality encoders")

        self.assertEqual(le.word, le2.word, "word encoder")
        self.assertEqual(le.char, le2.char, "char encoder")

        for task in le.tasks:
            self.assertEqual(
                le.tasks[task], le2.tasks[task],
                "Unequal serialized label encoder for task {}".format(task))
Exemple #5
0
    def load(fpath):
        """
        Load model from path

        Reads a tar archive (the path is passed through
        `utils.ensure_ext(fpath, 'tar')`) and rebuilds the model from its
        members: the label encoder, the constructor arguments, the model
        class name and the `state_dict.pt` weights. Settings are optional:
        if they cannot be read the model is returned without `_settings`.

        A stored `pie` commit that differs from the running one only
        triggers a warning.

        Returns the model, switched to eval mode.
        """
        import tempfile

        import pie

        with tarfile.open(utils.ensure_ext(fpath, 'tar'), 'r') as tar:
            # check commit
            try:
                commit = get_gzip_from_tar(tar, 'pie-commit.zip')
            except Exception:
                # no commit in file
                commit = None
            if pie.__commit__ is not None and commit is not None \
               and pie.__commit__ != commit:
                # `logging.warn` is a deprecated alias of `logging.warning`
                logging.warning(
                    ("Model {} was serialized with a previous "
                     "version of `pie`. This might result in issues. "
                     "Model commit is {}, whereas current `pie` commit is {}."
                     ).format(fpath, commit, pie.__commit__))
            # load label encoder
            le = MultiLabelEncoder.load_from_string(
                get_gzip_from_tar(tar, 'label_encoder.zip'))
            # load model parameters
            params = json.loads(get_gzip_from_tar(tar, 'parameters.zip'))
            # instantiate model
            model_type = getattr(pie.models,
                                 get_gzip_from_tar(tar, 'class.zip'))
            with utils.shutup():
                model = model_type(le, *params['args'], **params['kwargs'])
            # (optional) load settings; failure is tolerated, but no longer
            # swallows KeyboardInterrupt/SystemExit like a bare `except` did
            try:
                settings = Settings(
                    json.loads(get_gzip_from_tar(tar, 'settings.zip')))
                model._settings = settings
            except Exception:
                logging.debug(
                    "Couldn't load settings for model {}".format(fpath))
            # load state_dict: extract into a fresh private directory and
            # always remove it, even when loading the weights fails
            tmppath = tempfile.mkdtemp()
            try:
                tar.extract('state_dict.pt', path=tmppath)
                # map to CPU so weights saved from a GPU can still be read on
                # a machine without one; `load_state_dict` then copies them
                # into the freshly built model's parameters
                model.load_state_dict(
                    torch.load(os.path.join(tmppath, 'state_dict.pt'),
                               map_location='cpu'))
            finally:
                shutil.rmtree(tmppath)

        model.eval()

        return model
Exemple #6
0
    def test_batch_level(self):
        """
        With a batch size of 20, the number of generated batches should be
        about `self.insts // 20`, and about 95% of that once a 5% dev split
        has been requested from the dataset (both within `delta`).
        """
        config = settings_from_file(testpath)
        config['batch_size'] = 20
        corpus = Reader(config, config.input_path)
        encoder = MultiLabelEncoder.from_settings(config)
        encoder.fit(sent for _, sent in corpus.readsents())
        data = Dataset(config, corpus, encoder)

        # batches available before any dev split is taken
        n_before = 0
        n_before += sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(n_before, self.insts // 20, delta=delta)

        # The returned split itself is not inspected; the second count below
        # checks the effect of this call on `data`.
        _held_out = data.get_dev_split(self.insts, split=0.05)

        n_after = 0
        n_after += sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(n_before * 0.95, n_after, delta=delta)
Exemple #7
0
 def setUp(self):
     """
     Build `self.data`: a dataset over the test config's input path, with
     a label encoder fitted on that same reader.
     """
     config = settings_from_file(testpath)
     corpus = Reader(config, config.input_path)
     encoder = MultiLabelEncoder.from_settings(config)
     encoder.fit_reader(corpus)
     self.data = Dataset(config, corpus, encoder)
Exemple #8
0
import uuid
import torch
import os
import unittest

from pie.models import SimpleModel
from pie.data import MultiLabelEncoder, Reader, Dataset
from pie.settings import settings_from_file

# Module-level fixtures shared by the tests below: settings read from the
# `testconfig.json` next to this file, and a label encoder fitted on the
# configured input path.
testpath = os.path.join(os.path.dirname(__file__), 'testconfig.json')
settings = settings_from_file(testpath)
label_encoder = MultiLabelEncoder.from_settings(settings)
reader = Reader(settings, settings.input_path)
label_encoder.fit_reader(reader)
# The reader goes before the label encoder; the two were previously swapped.
# NOTE(review): argument order assumed to be (settings, reader,
# label_encoder) -- confirm against `Dataset.__init__`.
dataset = Dataset(settings, reader, label_encoder)


class TestModelSerialization(unittest.TestCase):
    """Check that a saved and re-loaded model keeps an equal label encoder."""

    def setUp(self):
        # the same embedding size is passed for both embedding arguments
        emb_dim, hidden_size, num_layers = 64, 100, 1
        self.model = SimpleModel(label_encoder, emb_dim, emb_dim, hidden_size,
                                 num_layers)

    def test_serialization(self):
        import tempfile

        model = self.model
        # unique target inside the platform's temp directory rather than a
        # hard-coded '/tmp'
        fid = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
        model.save(fid)
        try:
            model2 = SimpleModel.load(fid)
        finally:
            # remove the archive even when loading fails
            os.remove('{}.tar'.format(fid))
        self.assertEqual(model.label_encoder, model2.label_encoder)
Exemple #9
0
    return fname, infix


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('config_path', nargs='?', default='config.json')
    args = parser.parse_args()

    settings = settings_from_file(args.config_path)

    # datasets
    reader = Reader(settings, settings.input_path)
    tasks = reader.check_tasks(expected=None)
    label_encoder = MultiLabelEncoder.from_settings(settings, tasks=tasks)
    if settings.verbose:
        print("::: Available tasks :::")
        print()
        for task in tasks:
            print("- {}".format(task))
        print()

    # fit
    start = time.time()
    if settings.verbose:
        print("::: Fitting data :::")
        print()
    ninsts = label_encoder.fit_reader(reader)
    if settings.verbose:
        print("Found {} total instances in training set in {:g} secs".format(
Exemple #10
0
 def setUp(self):
     """
     Build `self.data`: a dataset whose label encoder was fitted on the
     second item of every pair yielded by the reader's `readsents`.
     """
     config = settings_from_file(testpath)
     corpus = Reader(config, config.input_path)
     encoder = MultiLabelEncoder.from_settings(config)
     sentences = (sent for _, sent in corpus.readsents())
     encoder.fit(sentences)
     self.data = Dataset(config, corpus, encoder)
Exemple #11
0
def run(config_path):
    """
    Train a `SimpleModel` as described by the settings file at `config_path`.

    Steps, in order: seed the random number generators from the current
    wall-clock time, load the settings and check that exactly one task is
    flagged as target, fit the label encoder on the training data, build the
    train (and optional dev) datasets, build the model (optionally
    initializing embeddings or the encoder from pretrained weights), train,
    optionally evaluate on the test set and, unless `settings.run_test` is
    set, save the model and append the dev scores to a results file.

    config_path: path handed to `settings_from_file`.

    Raises ValueError if no task, or more than one task, is the target.
    """
    # seed derived from the current time (HHMMSS read as an integer)
    now = datetime.now()
    seed = now.hour * 10000 + now.minute * 100 + now.second
    print("Using seed:", seed)
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    settings = settings_from_file(config_path)

    # check settings
    # - check at least and at most one target
    has_target = False
    for task in settings.tasks:
        # a single configured task is always the target
        if len(settings.tasks) == 1:
            task['target'] = True
        if task.get('target', False):
            if has_target:
                raise ValueError("Got more than one target task")
            has_target = True
    if not has_target:
        raise ValueError("Needs at least one target task")

    # datasets
    reader = Reader(settings, settings.input_path)
    tasks = reader.check_tasks(expected=None)
    if settings.verbose:
        print("::: Available tasks :::")
        print()
        for task in tasks:
            print("- {}".format(task))
        print()

    # label encoder
    label_encoder = MultiLabelEncoder.from_settings(settings, tasks=tasks)
    if settings.verbose:
        print("::: Fitting data :::")
        print()
    label_encoder.fit_reader(reader)

    if settings.verbose:
        print()
        print("::: Vocabulary :::")
        print()
        types = '{}/{}={:.2f}'.format(*label_encoder.word.get_type_stats())
        tokens = '{}/{}={:.2f}'.format(*label_encoder.word.get_token_stats())
        print("- {:<15} types={:<10} tokens={:<10}".format(
            "word", types, tokens))
        types = '{}/{}={:.2f}'.format(*label_encoder.char.get_type_stats())
        tokens = '{}/{}={:.2f}'.format(*label_encoder.char.get_token_stats())
        print("- {:<15} types={:<10} tokens={:<10}".format(
            "char", types, tokens))
        print()
        print("::: Tasks :::")
        print()
        for task, le in label_encoder.tasks.items():
            print("- {:<15} target={:<6} level={:<6} vocab={:<6}".format(
                task, le.target, le.level, len(le)))
        print()

    trainset = Dataset(settings, reader, label_encoder)

    devset = None
    if settings.dev_path:
        devset = Dataset(settings, Reader(settings, settings.dev_path),
                         label_encoder)
    else:
        logging.warning("No devset: cannot monitor/optimize training")

    # model
    model = SimpleModel(label_encoder,
                        settings.tasks,
                        settings.wemb_dim,
                        settings.cemb_dim,
                        settings.hidden_size,
                        settings.num_layers,
                        dropout=settings.dropout,
                        cell=settings.cell,
                        cemb_type=settings.cemb_type,
                        cemb_layers=settings.cemb_layers,
                        custom_cemb_cell=settings.custom_cemb_cell,
                        linear_layers=settings.linear_layers,
                        scorer=settings.scorer,
                        word_dropout=settings.word_dropout,
                        lm_shared_softmax=settings.lm_shared_softmax,
                        include_lm=settings.include_lm)

    # pretrain(/load pretrained) embeddings
    if model.wemb is not None:
        if settings.pretrain_embeddings:
            print("Pretraining word embeddings")
            # embeddings are pretrained over the train, dev and test paths
            wemb_reader = Reader(settings, settings.input_path,
                                 settings.dev_path, settings.test_path)
            weight = get_pretrained_embeddings(wemb_reader,
                                               label_encoder,
                                               size=settings.wemb_dim,
                                               window=5,
                                               negative=5,
                                               min_count=1)
            model.wemb.weight.data = torch.tensor(weight, dtype=torch.float32)

        elif settings.load_pretrained_embeddings:
            print("Loading pretrained embeddings")
            # NOTE(review): a missing file is only reported here; the
            # initialization below is still attempted -- confirm intended.
            if not os.path.isfile(settings.load_pretrained_embeddings):
                print("Couldn't find pretrained eembeddings in: {}".format(
                    settings.load_pretrained_embeddings))
            initialization.init_pretrained_embeddings(
                settings.load_pretrained_embeddings, label_encoder.word,
                model.wemb)

    # load pretrained weights
    if settings.load_pretrained_encoder:
        model.init_from_encoder(
            pie.Encoder.load(settings.load_pretrained_encoder))

    # freeze embeddings (skipped when the model has no word embeddings, which
    # would otherwise fail on `None.weight`)
    if settings.freeze_embeddings and model.wemb is not None:
        model.wemb.weight.requires_grad = False

    model.to(settings.device)

    print("::: Model :::")
    print()
    print(model)
    print()
    print("::: Model parameters :::")
    print()
    trainable = sum(p.nelement() for p in model.parameters()
                    if p.requires_grad)
    total = sum(p.nelement() for p in model.parameters())
    print("{}/{} trainable/total".format(trainable, total))
    print()

    # training
    print("Starting training")

    running_time = time.time()
    trainer = Trainer(settings, model, trainset, reader.get_nsents())
    scores = None
    try:
        scores = trainer.train_epochs(settings.epochs, devset=devset)
    except KeyboardInterrupt:
        # an interrupted run still goes on to evaluation and saving
        print("Stopping training")
    finally:
        model.eval()
    running_time = time.time() - running_time

    if settings.test_path:
        print("Evaluating model on test set")
        testset = Dataset(settings, Reader(settings, settings.test_path),
                          label_encoder)
        for task in model.evaluate(testset, trainset).values():
            task.print_summary()

    # save model
    fpath, infix = get_fname_infix(settings)
    if not settings.run_test:
        fpath = model.save(fpath, infix=infix, settings=settings)
        print("Saved best model to: [{}]".format(fpath))

    if devset is not None and not settings.run_test:
        scorers = model.evaluate(devset, trainset)
        scores = []
        for task in sorted(scorers):
            scorer = scorers[task]
            result = scorer.get_scores()
            for acc in result:
                scores.append('{}:{:.6f}'.format(task,
                                                 result[acc]['accuracy']))
                scores.append('{}-support:{}'.format(task,
                                                     result[acc]['support']))
        path = '{}.results.{}.csv'.format(settings.modelname,
                                          '-'.join(get_targets(settings)))
        # one tab-separated line is appended per run
        with open(path, 'a') as f:
            line = [infix, str(seed), str(running_time)]
            line += scores
            f.write('{}\n'.format('\t'.join(line)))

    print("Bye!")
Exemple #12
0
    from pie import utils
    with utils.shutup():        # avoid pattern warning
        from gensim.models import Word2Vec

    word2vec = Word2Vec(reader.get_token_iterator(), **kwargs)
    weight = np.zeros((len(label_encoder.word), word2vec.wv.vector_size))

    found = 0
    for w, idx in label_encoder.word.table.items():
        try:
            weight[idx] = word2vec.wv[w]
            found += 1
        except KeyError:  # reserved symbols are not in training sentences
            pass

    print("A total of {}/{} word embeddings were pretrained"
          .format(found, len(label_encoder.word)))

    return weight


if __name__ == '__main__':
    # Script entry point: fit a label encoder on the input path named in
    # "config.json" and pretrain embeddings over that same reader.
    from pie.data import Reader, MultiLabelEncoder
    from pie.settings import settings_from_file

    config = settings_from_file("config.json")
    corpus = Reader(config, config.input_path)
    encoder = MultiLabelEncoder.from_settings(config)
    encoder.fit_reader(corpus)
    get_pretrained_embeddings(corpus, encoder, min_count=1)