Example #1
class LanguageModelingTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(LanguageModelingTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'lm'

    def _create_task_specific_reader(self):
        self._create_vectorizers()

        reader_params = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        reader_params['nctx'] = reader_params.get('nctx', self.config_params.get('nctx', self.config_params.get('nbptt', 35)))
        reader_params['clean_fn'] = reader_params.get('clean_fn', self.config_params.get('preproc', {}).get('clean_fn'))
        if reader_params['clean_fn'] is not None and self.config_params['dataset'] != 'SST2':
            logger.warning('A reader preprocessing function (%s) is active; it is recommended that all data preprocessing be done outside of baseline to ensure that data at inference time matches data at training time.', reader_params['clean_fn'])
        reader_params['mxlen'] = self.vectorizers[self.primary_key].mxlen
        if self.config_params['model'].get('gpus', 1) > 1:
            reader_params['truncate'] = True
        return baseline.reader.create_reader(self.task_name(), self.vectorizers, self.config_params['preproc'].get('trim', False), **reader_params)

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))

        if backend.name == 'pytorch':
            self.config_params.setdefault('preproc', {})['trim'] = True

        elif backend.name == 'dy':
            self.config_params.setdefault('preproc', {})['trim'] = True
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}

        backend.load(self.task_name())
        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
        # TODO: make this optional
        if 'test_file' in self.dataset:
            vocab_sources.append(self.dataset['test_file'])
        vocabs = self.reader.build_vocab(vocab_sources,
                                         min_f=Task._get_min_f(self.config_params),
                                         vocab_file=self.dataset.get('vocab_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _load_dataset(self):
        read = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        tgt_key = read.get('tgt_key', self.primary_key)
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(
            self.dataset['train_file'],
            self.feat2index,
            bsz,
            tgt_key=tgt_key
        )
        self.valid_data = self.reader.load(
            self.dataset['valid_file'],
            self.feat2index,
            vbsz,
            tgt_key=tgt_key
        )
        self.test_data = None
        if 'test_file' in self.dataset:
            self.test_data = self.reader.load(
                self.dataset['test_file'],
                self.feat2index,
                1,
                tgt_key=tgt_key
            )

    def _create_model(self):

        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        model['batchsz'] = self.config_params['batchsz']
        model['tgt_key'] = self.config_params.get('reader',
                                                  self.config_params.get('loader', {})).get('tgt_key', self.primary_key)
        model['src_keys'] = listify(self.config_params.get('reader', self.config_params.get('loader', {})).get('src_keys', list(self.embeddings.keys())))
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_lang_model(self.embeddings, **model)

    def train(self, checkpoint=None):
        self._load_dataset()
        if self.config_params['train'].get('lr_scheduler_type', None) == 'zaremba':
            first_range = int(self.config_params['train']['start_decay_epoch'] * self.train_data.steps)
            self.config_params['train']['bounds'] = [first_range] + list(
                np.arange(
                    self.config_params['train']['start_decay_epoch'] + 1,
                    self.config_params['train']['epochs'] + 1,
                    dtype=np.int32
                ) * self.train_data.steps
            )
        baseline.save_vectorizers(self.get_basedir(), self.vectorizers)
        model = self._create_model()
        train_params = self.config_params['train']
        train_params['checkpoint'] = checkpoint
        metrics = baseline.train.fit(model, self.train_data, self.valid_data, self.test_data, **train_params)
        baseline.zip_files(self.get_basedir())
        self._close_reporting_hooks()
        return model, metrics

    @staticmethod
    def _num_steps_per_epoch(num_examples, nctx, batchsz):
        rest = num_examples // batchsz
        return rest // nctx
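
As a sanity check on the arithmetic above, here is a tiny, self-contained usage sketch of _num_steps_per_epoch together with the 'zaremba' bounds computed in train(); the token, batch and epoch counts below are hypothetical and only illustrate the math:

# (929589 // 20) // 35 == 1327 optimizer steps per epoch
steps = LanguageModelingTask._num_steps_per_epoch(num_examples=929589, nctx=35, batchsz=20)
assert steps == 1327
# with start_decay_epoch=6 and epochs=10, the zaremba branch in train() yields
# bounds == [6 * steps, 7 * steps, 8 * steps, 9 * steps, 10 * steps]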
Example #2
class TaggerTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(TaggerTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'tagger'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if 'preproc' not in self.config_params:
            self.config_params['preproc'] = {}
        if backend.name == 'pytorch':
            self.config_params['preproc']['trim'] = True
        elif backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                dy_params.set_autobatch(True)
            else:
                raise Exception('Tagger currently only supports autobatching. '
                                'Change "batchsz" to 1 and under "train", set "autobatchsz" to your desired batchsz')
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': False}
            self.config_params['preproc']['trim'] = True
        else:
            self.config_params['preproc']['trim'] = False

        backend.load(self.task_name())

        return backend

    def initialize(self, embeddings):
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
        # TODO: make this optional
        if 'test_file' in self.dataset:
            vocab_sources.append(self.dataset['test_file'])

        vocabs = self.reader.build_vocab(vocab_sources,
                                         min_f=Task._get_min_f(self.config_params),
                                         vocab_file=self.dataset.get('vocab_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _create_model(self):
        labels = self.reader.label2index
        span_type = self.config_params['train'].get('span_type')
        constrain = bool(self.config_params['model'].get('constrain_decode', False))
        if span_type is None and constrain:
            logger.warning("Constrained Decoding was set but no span type could be found so no Constraints will be applied.")
        self.config_params['model']['span_type'] = span_type
        if span_type is not None and constrain:
            self.config_params['model']['constraint'] = self.backend.transition_mask(
                labels, span_type, Offsets.GO, Offsets.EOS, Offsets.PAD
            )

        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)

        lengths_key = model.get('lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['lengths_key'] = lengths_key

        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_tagger_model(self.embeddings, labels, **model)

    def _load_dataset(self):
        # TODO: get rid of sort_key=self.primary_key in favor of something explicit?
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data, _ = self.reader.load(
            self.dataset['train_file'],
            self.feat2index,
            bsz,
            shuffle=True,
            sort_key='{}_lengths'.format(self.primary_key)
        )
        self.valid_data, _ = self.reader.load(
            self.dataset['valid_file'],
            self.feat2index,
            vbsz,
            sort_key=None
        )
        self.test_data = None
        self.txts = None
        if 'test_file' in self.dataset:
            self.test_data, self.txts = self.reader.load(
                self.dataset['test_file'],
                self.feat2index,
                tbsz,
                shuffle=False,
                sort_key=None
            )


    def train(self, checkpoint=None):
        self._load_dataset()
        baseline.save_vectorizers(self.get_basedir(), self.vectorizers)
        model = self._create_model()
        conll_output = self.config_params.get("conll_output", None)
        train_params = self.config_params['train']
        train_params['checkpoint'] = checkpoint
        metrics = baseline.train.fit(model, self.train_data, self.valid_data, self.test_data,
                           conll_output=conll_output,
                           txts=self.txts, **train_params)
        baseline.zip_files(self.get_basedir())
        self._close_reporting_hooks()
        return model, metrics
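
Because the DyNet branch above raises unless autobatching is configured, a matching config fragment would look roughly like the sketch below; the key names are the ones the code reads, while the concrete values are hypothetical:

config_params = {
    'backend': 'dy',
    'batchsz': 1,            # per the exception message: keep the outer batch size at 1
    'train': {
        'autobatchsz': 20,   # desired effective batch size, realized via DyNet autobatching
    },
}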
Example #3
class EncoderDecoderTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(EncoderDecoderTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'seq2seq'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if 'preproc' not in self.config_params:
            self.config_params['preproc'] = {}
        self.config_params['preproc']['show_ex'] = show_examples
        if backend.name == 'pytorch':
            self.config_params['preproc']['trim'] = True
        elif backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
            self.config_params['preproc']['trim'] = True
        else:
            self.config_params['preproc']['trim'] = True
        backend.load(self.task_name())

        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
        # TODO: make this optional
        if 'test_file' in self.dataset:
            vocab_sources.append(self.dataset['test_file'])
        vocab1, vocab2 = self.reader.build_vocabs(vocab_sources,
                                                  min_f=Task._get_min_f(self.config_params),
                                                  vocab_file=self.dataset.get('vocab_file'))

        # To keep the config file simple, share a list between source and destination (tgt)
        features_src = []
        features_tgt = None
        for feature in self.config_params['features']:
            if feature['name'] == 'tgt':
                features_tgt = feature
            else:
                features_src += [feature]

        self.src_embeddings, self.feat2src = self._create_embeddings(embeddings_set, vocab1, features_src)
        # For now, don't allow multiple vocabs for the output
        baseline.save_vocabs(self.get_basedir(), self.feat2src)
        self.tgt_embeddings, self.feat2tgt = self._create_embeddings(embeddings_set, {'tgt': vocab2}, [features_tgt])
        baseline.save_vocabs(self.get_basedir(), self.feat2tgt)
        self.tgt_embeddings = self.tgt_embeddings['tgt']
        self.feat2tgt = self.feat2tgt['tgt']

    def _load_dataset(self):
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(
            self.dataset['train_file'],
            self.feat2src, self.feat2tgt,
            bsz,
            shuffle=True,
            sort_key='{}_lengths'.format(self.primary_key)
        )

        self.valid_data = self.reader.load(
            self.dataset['valid_file'],
            self.feat2src, self.feat2tgt,
            vbsz,
            shuffle=True
        )
        self.test_data = None
        if 'test_file' in self.dataset:
            self.test_data = self.reader.load(
                self.dataset['test_file'],
                self.feat2src, self.feat2tgt,
                tbsz,
            )


    def _create_model(self):
        self.config_params['model']["unif"] = self.config_params["unif"]
        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('src_lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['src_lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_seq2seq_model(self.src_embeddings, self.tgt_embeddings, **model)

    def train(self, checkpoint=None):

        num_ex = self.config_params['num_valid_to_show']

        rlut1 = revlut(self.feat2src[self.primary_key])
        rlut2 = revlut(self.feat2tgt)
        if num_ex > 0:
            logger.info('Showing examples')
            preproc = self.config_params.get('preproc', {})
            show_ex_fn = preproc['show_ex']
            self.config_params['train']['after_train_fn'] = lambda model: show_ex_fn(model,
                                                                                     self.valid_data, rlut1, rlut2,
                                                                                     self.feat2tgt,
                                                                                     preproc['mxlen'], False, 0,
                                                                                     num_ex, reverse=False)
        self.config_params['train']['tgt_rlut'] = rlut2
        return super(EncoderDecoderTask, self).train(checkpoint)
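
The loop in initialize() splits the shared 'features' list purely by name: the entry named 'tgt' becomes the target feature and everything else is a source feature. A minimal sketch of such a block follows; only the 'name' keys matter to the code above, and the other fields (vectorizer types, embedding labels) are assumed, illustrative values:

config_params['features'] = [
    {'name': 'word', 'vectorizer': {'type': 'token1d'}, 'embeddings': {'label': 'glove-6B-100'}},  # collected into features_src
    {'name': 'tgt', 'vectorizer': {'type': 'token1d'}, 'embeddings': {'label': 'glove-6B-100'}},   # picked out as features_tgt
]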
Example #4
class EncoderDecoderTask(Task):
    def __init__(self, logging_file, mead_config, **kwargs):
        super(EncoderDecoderTask, self).__init__(logging_file, mead_config,
                                                 **kwargs)
        self.task = None

    def _create_task_specific_reader(self):
        preproc = self.config_params['preproc']
        reader = baseline.create_parallel_corpus_reader(
            preproc['mxlen'], preproc['vec_alloc'], preproc['trim'],
            preproc['word_trans_fn'], **self.config_params['loader'])
        return reader

    def _setup_task(self):

        # If it's not a vanilla seq2seq model, don't bother reversing
        do_reverse = self.config_params['model']['model_type'] == 'default'
        backend = self.config_params.get('backend', 'tensorflow')
        if backend == 'pytorch':
            print('PyTorch backend')
            from baseline.pytorch import long_0_tensor_alloc as vec_alloc
            from baseline.pytorch import tensor_shape as vec_shape
            from baseline.pytorch import tensor_reverse_2nd as rev2nd
            import baseline.pytorch.seq2seq as seq2seq
            self.config_params['preproc']['vec_alloc'] = vec_alloc
            self.config_params['preproc']['vec_shape'] = vec_shape
            src_vec_trans = rev2nd if do_reverse else None
            self.config_params['preproc']['word_trans_fn'] = src_vec_trans
            self.config_params['preproc']['show_ex'] = baseline.pytorch.show_examples_pytorch
            self.config_params['preproc']['trim'] = True
        else:
            import baseline.tf.seq2seq as seq2seq
            import mead.tf
            self.ExporterType = mead.tf.Seq2SeqTensorFlowExporter
            self.config_params['preproc']['vec_alloc'] = np.zeros
            self.config_params['preproc']['vec_shape'] = np.shape
            self.config_params['preproc']['trim'] = False
            src_vec_trans = baseline.reverse_2nd if do_reverse else None
            self.config_params['preproc']['word_trans_fn'] = src_vec_trans
            self.config_params['preproc']['show_ex'] = baseline.tf.show_examples_tf

        self.task = seq2seq

    def initialize(self, embeddings):
        embeddings_set = mead.utils.index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache,
                                      True).download()
        print(
            "[train file]: {}\n[valid file]: {}\n[test file]: {}\n[vocab file]: {}"
            .format(self.dataset['train_file'], self.dataset['valid_file'],
                    self.dataset['test_file'],
                    self.dataset.get('vocab_file', "None")))
        vocab_file = self.dataset.get('vocab_file', None)
        if vocab_file is not None:
            vocab1, vocab2 = self.reader.build_vocabs([vocab_file])
        else:
            vocab1, vocab2 = self.reader.build_vocabs([
                self.dataset['train_file'], self.dataset['valid_file'],
                self.dataset['test_file']
            ])
        self.embeddings1, self.feat2index1 = self._create_embeddings(
            embeddings_set, {'word': vocab1})
        self.embeddings2, self.feat2index2 = self._create_embeddings(
            embeddings_set, {'word': vocab2})

    def _load_dataset(self):
        self.train_data = self.reader.load(self.dataset['train_file'],
                                           self.feat2index1['word'],
                                           self.feat2index2['word'],
                                           self.config_params['batchsz'],
                                           shuffle=True)
        self.valid_data = self.reader.load(self.dataset['valid_file'],
                                           self.feat2index1['word'],
                                           self.feat2index2['word'],
                                           self.config_params['batchsz'],
                                           shuffle=True)
        self.test_data = self.reader.load(
            self.dataset['test_file'],
            self.feat2index1['word'], self.feat2index2['word'],
            self.config_params.get('test_batchsz', 1))

    def _create_model(self):
        return self.task.create_model(self.embeddings1['word'],
                                      self.embeddings2['word'],
                                      **self.config_params['model'])

    def train(self):

        num_ex = self.config_params['num_valid_to_show']

        if num_ex > 0:
            print('Showing examples')
            preproc = self.config_params['preproc']
            show_ex_fn = preproc['show_ex']
            rlut1 = baseline.revlut(self.feat2index1['word'])
            rlut2 = baseline.revlut(self.feat2index2['word'])
            self.config_params['train']['after_train_fn'] = lambda model: show_ex_fn(
                model, self.valid_data, rlut1, rlut2, self.embeddings2['word'],
                preproc['mxlen'], False, 0, num_ex, reverse=False)
        super(EncoderDecoderTask, self).train()
Example #5
class ClassifierTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(ClassifierTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'classify'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}

        backend.load(self.task_name())

        return backend

    def _setup_task(self, **kwargs):
        super(ClassifierTask, self)._setup_task(**kwargs)
        if self.config_params.get('preproc', {}).get('clean', False) is True:
            self.config_params.setdefault('preproc', {})['clean_fn'] = baseline.TSVSeqLabelReader.do_clean
            logger.info('Clean')
        else:
            self.config_params.setdefault('preproc', {})
            self.config_params['preproc']['clean_fn'] = None

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)

        vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
        # TODO: make this optional
        if 'test_file' in self.dataset:
            vocab_sources.append(self.dataset['test_file'])

        vocab, self.labels = self.reader.build_vocab(vocab_sources,
                                                     min_f=Task._get_min_f(self.config_params),
                                                     vocab_file=self.dataset.get('vocab_file'),
                                                     label_file=self.dataset.get('label_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocab, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _create_model(self):
        unif = self.config_params.get('unif', 0.1)
        model = self.config_params['model']
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_model(self.embeddings, self.labels, **model)

    def _load_dataset(self):
        read = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        sort_key = read.get('sort_key')
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(
            self.dataset['train_file'],
            self.feat2index,
            bsz,
            shuffle=True,
            sort_key=sort_key,
        )
        self.valid_data = self.reader.load(
            self.dataset['valid_file'],
            self.feat2index,
            vbsz,
        )
        self.test_data = None
        if 'test_file' in self.dataset:
            self.test_data = self.reader.load(
                self.dataset['test_file'],
                self.feat2index,
                tbsz,
            )
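
_load_dataset() above takes its sort key from the 'reader' (or legacy 'loader') section and its three batch sizes from Task._get_batchsz. A minimal config fragment exercising those paths might look like the sketch below; the values are hypothetical, and 'valid_batchsz'/'test_batchsz' follow the key names used by the older examples here and are assumed to be what _get_batchsz consumes:

config_params.update({
    'batchsz': 50,                            # training batch size
    'valid_batchsz': 50,                      # validation batch size
    'test_batchsz': 100,                      # test batch size
    'reader': {'sort_key': 'word_lengths'},   # bucket training batches by length
})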
Example #6
class LanguageModelingTask(Task):
    def __init__(self, logging_config, mead_settings_config, **kwargs):
        super(LanguageModelingTask, self).__init__(logging_config, mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'lm'

    def _create_task_specific_reader(self):
        self._create_vectorizers()

        reader_params = self.config_params['loader']
        reader_params['nctx'] = reader_params.get(
            'nctx',
            self.config_params.get('nctx', self.config_params.get('nbptt',
                                                                  35)))
        reader_params['clean_fn'] = reader_params.get(
            'clean_fn',
            self.config_params.get('preproc', {}).get('clean_fn'))
        reader_params['mxlen'] = self.vectorizers[self.primary_key].mxlen
        if self.config_params['model'].get('gpus', 1) > 1:
            reader_params['truncate'] = True
        return baseline.reader.create_reader(
            self.task_name(), self.vectorizers,
            self.config_params['preproc'].get('trim', False), **reader_params)

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))

        if backend.name == 'pytorch':
            self.config_params.setdefault('preproc', {})['trim'] = True

        elif backend.name == 'dy':
            self.config_params.setdefault('preproc', {})['trim'] = True
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {
                'pc': _dynet.ParameterCollection(),
                'batched': batched
            }

        backend.load(self.task_name())
        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset,
                                      self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocabs = self.reader.build_vocab(
            [
                self.dataset['train_file'], self.dataset['valid_file'],
                self.dataset['test_file']
            ],
            min_f=Task._get_min_f(self.config_params),
            vocab_file=self.dataset.get('vocab_file'))
        self.embeddings, self.feat2index = self._create_embeddings(
            embeddings_set, vocabs, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _load_dataset(self):
        tgt_key = self.config_params['loader'].get('tgt_key', self.primary_key)
        self.train_data = self.reader.load(self.dataset['train_file'],
                                           self.feat2index,
                                           self.config_params['batchsz'],
                                           tgt_key=tgt_key)
        self.valid_data = self.reader.load(self.dataset['valid_file'],
                                           self.feat2index,
                                           self.config_params.get(
                                               'valid_batchsz',
                                               self.config_params['batchsz']),
                                           tgt_key=tgt_key)
        self.test_data = self.reader.load(self.dataset['test_file'],
                                          self.feat2index,
                                          1,
                                          tgt_key=tgt_key)

    def _create_model(self):

        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        model['batchsz'] = self.config_params['batchsz']
        model['tgt_key'] = self.config_params['loader'].get(
            'tgt_key', self.primary_key)
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_lang_model(self.embeddings, **model)

    def train(self):
        self._load_dataset()
        if self.config_params['train'].get('lr_scheduler_type',
                                           None) == 'zaremba':
            first_range = int(
                self.config_params['train']['start_decay_epoch'] *
                self.train_data.steps)
            self.config_params['train']['bounds'] = [first_range] + list(
                np.arange(self.config_params['train']['start_decay_epoch'] + 1,
                          self.config_params['train']['epochs'] + 1,
                          dtype=np.int32) * self.train_data.steps)
        baseline.save_vectorizers(self.get_basedir(), self.vectorizers)
        model = self._create_model()
        baseline.train.fit(model, self.train_data, self.valid_data,
                           self.test_data, **self.config_params['train'])
        baseline.zip_files(self.get_basedir())
        self._close_reporting_hooks()

    @staticmethod
    def _num_steps_per_epoch(num_examples, nctx, batchsz):
        rest = num_examples // batchsz
        return rest // nctx
Example #7
class ClassifierTask(Task):
    def __init__(self, logging_config, mead_settings_config, **kwargs):
        super(ClassifierTask, self).__init__(logging_config,
                                             mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'classify'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {
                'pc': _dynet.ParameterCollection(),
                'batched': batched
            }
        elif backend.name == 'tf':
            # FIXME this should be registered as well!
            exporter_type = kwargs.get('exporter_type', 'default')
            if exporter_type == 'default':
                from mead.tf.exporters import ClassifyTensorFlowExporter
                backend.exporter = ClassifyTensorFlowExporter
            elif exporter_type == 'preproc':
                from mead.tf.preproc_exporters import ClassifyTensorFlowPreProcExporter
                import mead.tf.preprocessors
                backend.exporter = ClassifyTensorFlowPreProcExporter

        backend.load(self.task_name())

        return backend

    def _setup_task(self, **kwargs):
        super(ClassifierTask, self)._setup_task(**kwargs)
        if self.config_params.get('preproc', {}).get('clean', False) is True:
            self.config_params.setdefault('preproc', {})['clean_fn'] = baseline.TSVSeqLabelReader.do_clean
            print('Clean')
        else:
            self.config_params.setdefault('preproc', {})['clean_fn'] = None

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset,
                                      self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab, self.labels = self.reader.build_vocab(
            [
                self.dataset['train_file'], self.dataset['valid_file'],
                self.dataset['test_file']
            ],
            min_f=Task._get_min_f(self.config_params),
            vocab_file=self.dataset.get('vocab_file'),
            label_file=self.dataset.get('label_file'))
        self.embeddings, self.feat2index = self._create_embeddings(
            embeddings_set, vocab, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _create_model(self):
        unif = self.config_params.get('unif', 0.1)
        model = self.config_params['model']
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_model(self.embeddings, self.labels,
                                           **model)

    def _load_dataset(self):
        self.train_data = self.reader.load(
            self.dataset['train_file'],
            self.feat2index,
            self.config_params['batchsz'],
            shuffle=True,
            sort_key=self.config_params['loader'].get('sort_key'))
        self.valid_data = self.reader.load(
            self.dataset['valid_file'], self.feat2index,
            self.config_params.get('valid_batchsz',
                                   self.config_params['batchsz']))
        self.test_data = self.reader.load(
            self.dataset['test_file'], self.feat2index,
            self.config_params.get('test_batchsz', 1))
Example #8
class LanguageModelingTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(LanguageModelingTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'lm'

    def _create_task_specific_reader(self):
        self._create_vectorizers()

        reader_params = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        reader_params['nctx'] = reader_params.get('nctx', self.config_params.get('nctx', self.config_params.get('nbptt', 35)))
        reader_params['clean_fn'] = reader_params.get('clean_fn', self.config_params.get('preproc', {}).get('clean_fn'))
        if reader_params['clean_fn'] is not None and self.config_params['dataset'] != 'SST2':
            logger.warning('A reader preprocessing function (%s) is active; it is recommended that all data preprocessing be done outside of baseline to ensure that data at inference time matches data at training time.', reader_params['clean_fn'])
        reader_params['mxlen'] = self.vectorizers[self.primary_key].mxlen
        if self.config_params['model'].get('gpus', 1) > 1:
            reader_params['truncate'] = True
        return baseline.reader.create_reader(self.task_name(), self.vectorizers, self.config_params['preproc'].get('trim', False), **reader_params)

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))

        if backend.name == 'pytorch':
            self.config_params.setdefault('preproc', {})['trim'] = True

        elif backend.name == 'dy':
            self.config_params.setdefault('preproc', {})['trim'] = True
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}

        backend.load(self.task_name())
        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocabs = self.reader.build_vocab(
            [self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']],
            min_f=Task._get_min_f(self.config_params),
            vocab_file=self.dataset.get('vocab_file')
        )
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _load_dataset(self):
        read = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        tgt_key = read.get('tgt_key', self.primary_key)
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(
            self.dataset['train_file'],
            self.feat2index,
            bsz,
            tgt_key=tgt_key
        )
        self.valid_data = self.reader.load(
            self.dataset['valid_file'],
            self.feat2index,
            vbsz,
            tgt_key=tgt_key
        )
        self.test_data = self.reader.load(
            self.dataset['test_file'],
            self.feat2index,
            1,
            tgt_key=tgt_key
        )

    def _create_model(self):

        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        model['batchsz'] = self.config_params['batchsz']
        model['tgt_key'] = self.config_params.get('reader', self.config_params.get('loader', {})).get('tgt_key', self.primary_key)
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_lang_model(self.embeddings, **model)

    def train(self, checkpoint=None):
        self._load_dataset()
        if self.config_params['train'].get('lr_scheduler_type', None) == 'zaremba':
            first_range = int(self.config_params['train']['start_decay_epoch'] * self.train_data.steps)
            self.config_params['train']['bounds'] = [first_range] + list(
                np.arange(
                    self.config_params['train']['start_decay_epoch'] + 1,
                    self.config_params['train']['epochs'] + 1,
                    dtype=np.int32
                ) * self.train_data.steps
            )
        baseline.save_vectorizers(self.get_basedir(), self.vectorizers)
        model = self._create_model()
        train_params = self.config_params['train']
        train_params['checkpoint'] = checkpoint
        metrics = baseline.train.fit(model, self.train_data, self.valid_data, self.test_data, **train_params)
        baseline.zip_files(self.get_basedir())
        self._close_reporting_hooks()
        return model, metrics

    @staticmethod
    def _num_steps_per_epoch(num_examples, nctx, batchsz):
        rest = num_examples // batchsz
        return rest // nctx
Example #9
class EncoderDecoderTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(EncoderDecoderTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'seq2seq'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if 'preproc' not in self.config_params:
            self.config_params['preproc'] = {}
        self.config_params['preproc']['show_ex'] = show_examples
        if backend.name == 'pytorch':
            self.config_params['preproc']['trim'] = True
        elif backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
            self.config_params['preproc']['trim'] = True
        else:
            self.config_params['preproc']['trim'] = True
        backend.load(self.task_name())

        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab1, vocab2 = self.reader.build_vocabs(
            [self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']],
            min_f=Task._get_min_f(self.config_params),
            vocab_file=self.dataset.get('vocab_file')
        )

        # To keep the config file simple, share a list between source and destination (tgt)
        features_src = []
        features_tgt = None
        for feature in self.config_params['features']:
            if feature['name'] == 'tgt':
                features_tgt = feature
            else:
                features_src += [feature]

        self.src_embeddings, self.feat2src = self._create_embeddings(embeddings_set, vocab1, features_src)
        # For now, don't allow multiple vocabs for the output
        baseline.save_vocabs(self.get_basedir(), self.feat2src)
        self.tgt_embeddings, self.feat2tgt = self._create_embeddings(embeddings_set, {'tgt': vocab2}, [features_tgt])
        baseline.save_vocabs(self.get_basedir(), self.feat2tgt)
        self.tgt_embeddings = self.tgt_embeddings['tgt']
        self.feat2tgt = self.feat2tgt['tgt']

    def _load_dataset(self):
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(
            self.dataset['train_file'],
            self.feat2src, self.feat2tgt,
            bsz,
            shuffle=True,
            sort_key='{}_lengths'.format(self.primary_key)
        )

        self.valid_data = self.reader.load(
            self.dataset['valid_file'],
            self.feat2src, self.feat2tgt,
            vbsz,
            shuffle=True
        )
        self.test_data = self.reader.load(
            self.dataset['test_file'],
            self.feat2src, self.feat2tgt,
            tbsz,
        )

    def _create_model(self):
        self.config_params['model']["unif"] = self.config_params["unif"]
        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('src_lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['src_lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_seq2seq_model(self.src_embeddings, self.tgt_embeddings, **model)

    def train(self, checkpoint=None):

        num_ex = self.config_params['num_valid_to_show']

        rlut1 = revlut(self.feat2src[self.primary_key])
        rlut2 = revlut(self.feat2tgt)
        if num_ex > 0:
            logger.info('Showing examples')
            preproc = self.config_params.get('preproc', {})
            show_ex_fn = preproc['show_ex']
            self.config_params['train']['after_train_fn'] = lambda model: show_ex_fn(model,
                                                                                     self.valid_data, rlut1, rlut2,
                                                                                     self.feat2tgt,
                                                                                     preproc['mxlen'], False, 0,
                                                                                     num_ex, reverse=False)
        self.config_params['train']['tgt_rlut'] = rlut2
        return super(EncoderDecoderTask, self).train(checkpoint)
Example #10
class ClassifierTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(ClassifierTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'classify'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}

        backend.load(self.task_name())

        return backend

    def _setup_task(self, **kwargs):
        super(ClassifierTask, self)._setup_task(**kwargs)
        if self.config_params.get('preproc', {}).get('clean', False) is True:
            self.config_params.setdefault('preproc', {})['clean_fn'] = baseline.TSVSeqLabelReader.do_clean
            logger.info('Clean')
        else:
            self.config_params.setdefault('preproc', {})['clean_fn'] = None

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab, self.labels = self.reader.build_vocab(
            [self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']],
            min_f=Task._get_min_f(self.config_params),
            vocab_file=self.dataset.get('vocab_file'),
            label_file=self.dataset.get('label_file')
        )
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocab, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _create_model(self):
        unif = self.config_params.get('unif', 0.1)
        model = self.config_params['model']
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_model(self.embeddings, self.labels, **model)

    def _load_dataset(self):
        read = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        sort_key = read.get('sort_key')
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(
            self.dataset['train_file'],
            self.feat2index,
            bsz,
            shuffle=True,
            sort_key=sort_key,
        )
        self.valid_data = self.reader.load(
            self.dataset['valid_file'],
            self.feat2index,
            vbsz,
        )
        self.test_data = self.reader.load(
            self.dataset['test_file'],
            self.feat2index,
            tbsz,
        )