Beispiel #1
0
 def __init__(self, vocabs=None, vectorizers=None, model=None, preproc='client'):
     """Build the service and precompute reverse lookup tables.

     :param vocabs: mapping of feature name to token->index vocabulary
     :param vectorizers: mapping of feature name to vectorizer
     :param model: the loaded tagger model
     :param preproc: where preprocessing happens (default 'client')
     """
     super().__init__(vocabs, vectorizers, model, preproc)
     # The model always returns indices (no need for `return_labels`),
     # so build an index -> label table up front.
     self.label_vocab = revlut(self.get_labels())
     # One reverse (index -> token) table per input vocabulary.
     self.rev_vocab = {}
     for name, vocab in self.vocabs.items():
         self.rev_vocab[name] = revlut(vocab)
Beispiel #2
0
 def __init__(self, vocabs=None, vectorizers=None, model=None, preproc='client'):
     """Build the tagger service and set up label decoding.

     :param vocabs: mapping of feature name to token->index vocabulary
     :param vectorizers: mapping of feature name to vectorizer
     :param model: the loaded tagger model
     :param preproc: where preprocessing happens (default 'client')
     """
     super().__init__(vocabs, vectorizers, model, preproc)
     # Older models may not expose `return_labels`; fall back to the
     # default tagger behavior of returning indices.
     self.return_labels = getattr(self.model, 'return_labels', False)
     if not self.return_labels:
         # Model emits indices, so precompute an index -> label table.
         self.label_vocab = revlut(self.get_labels())
     self.rev_vocab = {k: revlut(v) for k, v in self.vocabs.items()}
Beispiel #3
0
 def __init__(self, model, span_type, verbose):
     """Hold a tagger model together with span decoding settings.

     :param model: tagger model whose labels are reversed for decoding
     :param span_type: span encoding scheme used when decoding
     :param verbose: when truthy, log the configured span type
     """
     self.model = model
     self.span_type = span_type
     self.verbose = verbose
     # index -> label table used to decode model output
     self.idx2label = revlut(model.labels)
     if verbose:
         logger.info('Setting span type %s', self.span_type)
Beispiel #4
0
 def __init__(self, model, span_type, verbose):
     """Hold a tagger model together with span decoding settings.

     :param model: tagger model whose labels are reversed for decoding
     :param span_type: span encoding scheme used when decoding
     :param verbose: when truthy, log the configured span type
     """
     self.model = model
     self.idx2label = revlut(model.labels)
     self.span_type = span_type
     if verbose:
         # Use the module logger instead of print() so the message respects
         # logging configuration (consistent with the sibling constructors).
         logger.info('Setting span type %s', self.span_type)
     self.verbose = verbose
Beispiel #5
0
 def __init__(self, model, span_type, verbose):
     """Hold a tagger model together with span decoding settings.

     :param model: tagger model whose labels are reversed for decoding
     :param span_type: span encoding scheme used when decoding
     :param verbose: when truthy, log the configured span type
     """
     self.model = model
     # Reverse table (index -> label) for decoding model output.
     self.idx2label = revlut(model.labels)
     self.span_type = span_type
     self.verbose = verbose
     if self.verbose:
         logger.info('Setting span type %s', self.span_type)
Beispiel #6
0
    def train(self, checkpoint=None):
        """Register validation-example display hooks, then delegate training.

        :param checkpoint: optional checkpoint to resume from
        :return: the result of the base task's `train`
        """
        num_ex = self.config_params['num_valid_to_show']
        # Reverse lookup tables for the primary source feature and the target.
        src_rlut = revlut(self.feat2src[self.primary_key])
        tgt_rlut = revlut(self.feat2tgt)

        if num_ex > 0:
            logger.info('Showing examples')
            preproc = self.config_params.get('preproc', {})
            show_ex_fn = preproc['show_ex']

            def _after_train(model):
                # Show a handful of validation examples after training.
                return show_ex_fn(model, self.valid_data, src_rlut, tgt_rlut,
                                  self.feat2tgt, preproc['mxlen'], False, 0,
                                  num_ex, reverse=False)

            self.config_params['train']['after_train_fn'] = _after_train

        self.config_params['train']['tgt_rlut'] = tgt_rlut
        return super(EncoderDecoderTask, self).train(checkpoint)
Beispiel #7
0
 def __init__(self, vocabs=None, vectorizers=None, model=None, preproc='client'):
     """Build the tagger service and set up label decoding.

     :param vocabs: mapping of feature name to token->index vocabulary
     :param vectorizers: mapping of feature name to vectorizer
     :param model: the loaded tagger model
     :param preproc: where preprocessing happens (default 'client')
     """
     super(TaggerService, self).__init__(vocabs, vectorizers, model, preproc)
     # Older models may not expose `return_labels`; default tagger behavior
     # is to return indices.
     self.return_labels = getattr(self.model, 'return_labels', False)
     if not self.return_labels:
         # Precompute an index -> label table for decoding model output.
         self.label_vocab = revlut(self.get_labels())
Beispiel #8
0
 def __init__(self, vocabs=None, vectorizers=None, model=None, preproc='client'):
     """Build the tagger service and set up label decoding.

     :param vocabs: mapping of feature name to token->index vocabulary
     :param vectorizers: mapping of feature name to vectorizer
     :param model: the loaded tagger model
     :param preproc: where preprocessing happens (default 'client')
     """
     super(TaggerService, self).__init__(vocabs, vectorizers, model, preproc)
     # Older models may not expose `return_labels` at all.
     if hasattr(self.model, 'return_labels'):
         self.return_labels = self.model.return_labels
     else:
         self.return_labels = False  # keeping the default tagger behavior
     if not self.return_labels:
         # Model emits indices, so precompute an index -> label table.
         self.label_vocab = revlut(self.get_labels())
Beispiel #9
0
    def train(self, checkpoint=None):
        """Register validation-example display hooks, then delegate training
        to the base task.

        :param checkpoint: optional checkpoint to resume from
        :return: the result of the base task's `train`
        """
        num_ex = self.config_params['num_valid_to_show']

        # Reverse lookup tables for the primary source feature and the target.
        rlut1 = revlut(self.feat2src[self.primary_key])
        rlut2 = revlut(self.feat2tgt)
        if num_ex > 0:
            logger.info('Showing examples')
            preproc = self.config_params.get('preproc', {})
            show_ex_fn = preproc['show_ex']
            # After training, display `num_ex` validation examples.
            self.config_params['train']['after_train_fn'] = lambda model: show_ex_fn(model,
                                                                                     self.valid_data, rlut1, rlut2,
                                                                                     self.feat2tgt,
                                                                                     preproc['mxlen'], False, 0,
                                                                                     num_ex, reverse=False)
        self.config_params['train']['tgt_rlut'] = rlut2
        return super(EncoderDecoderTask, self).train(checkpoint)
 def __init__(self, model, **kwargs):
     """Create the PyTorch tagger trainer.

     :param model: tagger model to train
     :param kwargs: supports `nogpu` and `clip` (default 5), plus
         optimizer settings consumed by the optimizer helper
     """
     super(TaggerTrainerPyTorch, self).__init__()
     self.model = model
     # Reverse table (index -> label) for decoding predictions.
     self.idx2label = revlut(self.model.labels)
     self.gpu = not bool(kwargs.get('nogpu', False))
     self.clip = float(kwargs.get('clip', 5))
     self.optimizer, self.scheduler = pytorch_prepare_optimizer(self.model, **kwargs)
     if self.gpu:
         self.model = model.to_gpu()
Beispiel #11
0
    def __init__(self, model, **kwargs):
        """Create the DyNet tagger trainer.

        :param model: tagger model to train
        :param kwargs: supports `span_type` (default 'iob'), `nogpu`,
            and `autobatchsz`, plus optimizer settings
        """
        super(TaggerTrainerDyNet, self).__init__()
        self.span_type = kwargs.get('span_type', 'iob')
        self.gpu = not bool(kwargs.get('nogpu', False))
        self.model = model
        self.labels = model.labels
        # Reverse table (index -> label) for decoding predictions.
        self.idx2label = revlut(self.model.labels)
        self.autobatchsz = kwargs.get('autobatchsz')
        self.optimizer = optimizer(model, **kwargs)
Beispiel #12
0
 def __init__(self, model, **kwargs):
     """Create the PyTorch tagger trainer.

     :param model: tagger model to train
     :param kwargs: supports `nogpu`, `span_type` (default 'iob'), and
         `clip` (default 5), plus optimizer settings
     """
     super(TaggerTrainerPyTorch, self).__init__()
     self.gpu = not bool(kwargs.get('nogpu', False))
     # By default support IOB1/IOB2
     self.span_type = kwargs.get('span_type', 'iob')
     # Use the module logger instead of print() so the message respects
     # logging configuration (consistent with the other trainer constructors).
     logger.info('Setting span type %s', self.span_type)
     self.model = model
     self.idx2label = revlut(self.model.labels)
     self.clip = float(kwargs.get('clip', 5))
     self.optimizer, self.scheduler = pytorch_prepare_optimizer(self.model, **kwargs)
     if self.gpu:
         self.model = model.to_gpu()
Beispiel #13
0
    def __init__(self, model, **kwargs):
        """Create the DyNet tagger trainer.

        :param model: tagger model to train
        :param kwargs: supports `span_type` (default 'iob'), `nogpu`,
            `autobatchsz`, and `nsteps`, plus optimizer settings
        """
        super(TaggerTrainerDyNet, self).__init__()
        self.model = model
        self.labels = model.labels
        # Reverse table (index -> label) for decoding predictions.
        self.idx2label = revlut(self.model.labels)
        self.span_type = kwargs.get('span_type', 'iob')
        self.gpu = not bool(kwargs.get('nogpu', False))
        self.autobatchsz = kwargs.get('autobatchsz')
        self.optimizer = OptimizerManager(model, **kwargs)
        self.nsteps = kwargs.get('nsteps', six.MAXSIZE)
Beispiel #14
0
    def __init__(self, model, **kwargs):
        """Create the DyNet tagger trainer.

        :param model: tagger model to train
        :param kwargs: supports `span_type` (default 'iob'), `nogpu`,
            `autobatchsz`, `nsteps`, and `verbose`, plus optimizer settings
        """
        super(TaggerTrainerDyNet, self).__init__()
        self.span_type = kwargs.get('span_type', 'iob')
        self.verbose = kwargs.get('verbose', False)
        # Only announce the span type when verbose output was requested,
        # consistent with the PyTorch trainer variant in this file.
        if self.verbose:
            logger.info('Setting span type %s', self.span_type)
        self.gpu = not bool(kwargs.get('nogpu', False))
        self.model = model
        self.idx2label = revlut(self.model.labels)
        self.autobatchsz = kwargs.get('autobatchsz')
        self.labels = model.labels
        self.optimizer = OptimizerManager(model, **kwargs)
        self.nsteps = kwargs.get('nsteps', six.MAXSIZE)
Beispiel #15
0
    def __init__(self, model, **kwargs):
        """Create the PyTorch tagger trainer.

        :param model: tagger model to train
        :param kwargs: supports `nogpu`, `span_type` (default 'iob'),
            `verbose`, `clip` (default 5), and `nsteps`, plus optimizer
            settings
        """
        super(TaggerTrainerPyTorch, self).__init__()
        self.gpu = not bool(kwargs.get('nogpu', False))
        # By default support IOB1/IOB2
        self.span_type = kwargs.get('span_type', 'iob')
        self.verbose = kwargs.get('verbose', False)
        logger.info('Setting span type %s', self.span_type)

        self.model = model
        # Reverse table (index -> label) for decoding predictions.
        self.idx2label = revlut(self.model.labels)
        self.clip = float(kwargs.get('clip', 5))
        self.optimizer = OptimizerManager(self.model, **kwargs)
        self.nsteps = kwargs.get('nsteps', six.MAXSIZE)
        if self.gpu:
            self.model = model.to_gpu()
Beispiel #16
0
    def __init__(self, model, **kwargs):
        """Create the PyTorch tagger trainer.

        :param model: tagger model to train
        :param kwargs: supports `nogpu`, `span_type` (default 'iob'),
            `verbose`, `clip` (default 5), and `nsteps`, plus optimizer
            settings
        """
        super(TaggerTrainerPyTorch, self).__init__()
        self.model = model
        # Reverse table (index -> label) for decoding predictions.
        self.idx2label = revlut(self.model.labels)
        self.gpu = not bool(kwargs.get('nogpu', False))
        # By default support IOB1/IOB2
        self.span_type = kwargs.get('span_type', 'iob')
        self.verbose = kwargs.get('verbose', False)
        if self.verbose:
            logger.info('Setting span type %s', self.span_type)
        self.clip = float(kwargs.get('clip', 5))
        self.optimizer = OptimizerManager(self.model, **kwargs)
        if self.gpu:
            self.model = model.to_gpu()
        self.nsteps = kwargs.get('nsteps', six.MAXSIZE)
Beispiel #17
0
    def predict_text(self, tokens, mxlen, maxw, zero_alloc=np.zeros, word_trans_fn=lowercase):
        """Tag one tokenized sentence and pair each token with its label.

        Converts the tokens into word/char index tensors, runs the tagger,
        and maps the predicted indices back to label strings.

        This method is not aware of any input features other than words and
        characters (and lengths).  If you wish to use other features and have
        a custom model that is aware of those, use `predict` directly.

        :param tokens: list of token strings for one sentence
        :param mxlen: maximum sentence length; extra tokens are dropped
        :param maxw: maximum word length; extra characters are dropped
        :param zero_alloc: allocator for zeroed arrays (default np.zeros)
        :param word_trans_fn: word normalization applied before vocab lookup
        :return: list of (token, label) pairs
        """
        words_vocab = self.get_vocab(vocab_type='word')
        chars_vocab = self.get_vocab(vocab_type='char')
        # This might be inefficient if the label space is large
        label_vocab = revlut(self.get_labels())
        xs = zero_alloc((1, mxlen), dtype=int)
        xs_ch = zero_alloc((1, mxlen, maxw), dtype=int)
        lengths = zero_alloc(1, dtype=int)
        lengths[0] = min(len(tokens), mxlen)
        # Fill word and character index tensors; unknown entries stay 0.
        for j, w in enumerate(tokens[:mxlen]):
            xs[0, j] = words_vocab.get(word_trans_fn(w), 0)
            for k, ch in enumerate(w[:maxw]):
                xs_ch[0, j, k] = chars_vocab.get(ch, 0)
        indices = self.predict({'x': xs, 'xch': xs_ch, 'lengths': lengths})[0]
        return [(tokens[j], label_vocab[indices[j]]) for j in range(lengths[0])]
Beispiel #18
0
    def __init__(self, vocabs=None, vectorizers=None, model=None, preproc='client'):
        """Build the encoder-decoder service, splitting source/target assets.

        Entries keyed 'tgt' are the target-side vocabulary/vectorizer; all
        other entries are treated as source-side features.

        :param vocabs: mapping of feature name to vocabulary
        :param vectorizers: mapping of feature name to vectorizer
        :param model: the loaded encoder-decoder model
        :param preproc: where preprocessing happens (default 'client')
        """
        super(EncoderDecoderService, self).__init__(None, None, model, preproc)
        # Partition vocabularies into source features vs the 'tgt' target side.
        self.src_vocabs = {k: v for k, v in vocabs.items() if k != 'tgt'}
        self.tgt_vocab = vocabs.get('tgt')
        # index -> token table used to turn decoder output back into text
        self.tgt_idx_to_token = revlut(self.tgt_vocab)
        # Same partition for the vectorizers.
        self.src_vectorizers = {k: v for k, v in vectorizers.items() if k != 'tgt'}
        self.tgt_vectorizer = vectorizers.get('tgt')
Beispiel #19
0
    def __init__(self, vocabs=None, vectorizers=None, model=None, preproc='client'):
        """Build the encoder-decoder service, splitting source/target assets.

        Entries keyed 'tgt' are the target-side vocabulary/vectorizer; all
        other entries are treated as source-side features.

        NOTE(review): despite the `None` defaults, `vocabs` and `vectorizers`
        are iterated unconditionally, so callers must pass real mappings.

        :param vocabs: mapping of feature name to vocabulary
        :param vectorizers: mapping of feature name to vectorizer
        :param model: the loaded encoder-decoder model
        :param preproc: where preprocessing happens (default 'client')
        """
        super(EncoderDecoderService, self).__init__(None, None, model, preproc)
        self.src_vocabs = {}
        self.tgt_vocab = None
        for k, vocab in vocabs.items():
            if k == 'tgt':
                self.tgt_vocab = vocab
            else:
                self.src_vocabs[k] = vocab

        # index -> token table used to turn decoder output back into text
        self.tgt_idx_to_token = revlut(self.tgt_vocab)
        self.src_vectorizers = {}
        self.tgt_vectorizer = None
        for k, vectorizer, in vectorizers.items():
            if k == 'tgt':
                self.tgt_vectorizer = vectorizer
            else:
                self.src_vectorizers[k] = vectorizer
Beispiel #20
0
    def predict_text(self,
                     tokens,
                     mxlen,
                     maxw,
                     zero_alloc=np.zeros,
                     word_trans_fn=lowercase):
        """
        Utility function to convert lists of sentence tokens to integer value one-hots which
        are then passed to the tagger.  The resultant output is then converted back to label and token
        to be printed

        :param tokens: list of token strings for one sentence
        :param mxlen: maximum sentence length; extra tokens are dropped
        :param maxw: maximum word length; extra characters are dropped
        :param zero_alloc: allocator for zeroed arrays (default np.zeros)
        :param word_trans_fn: word normalization applied before vocab lookup
        :return: list of (token, label) pairs
        """
        words_vocab = self.get_vocab(vocab_type='word')
        chars_vocab = self.get_vocab(vocab_type='char')
        # This might be inefficient if the label space is large
        label_vocab = revlut(self.get_labels())
        xs = zero_alloc((1, mxlen), dtype=int)
        xs_ch = zero_alloc((1, mxlen, maxw), dtype=int)
        lengths = zero_alloc(1, dtype=int)
        lengths[0] = min(len(tokens), mxlen)
        # Fill word and character index tensors; unknown entries stay 0.
        for j in range(mxlen):

            if j == len(tokens):
                break

            w = tokens[j]
            nch = min(len(w), maxw)

            xs[0, j] = words_vocab.get(word_trans_fn(w), 0)
            for k in range(nch):
                xs_ch[0, j, k] = chars_vocab.get(w[k], 0)

        # NOTE(review): positional `predict(xs, xs_ch, lengths)` here, unlike
        # the dict-based variant elsewhere in this file — confirm model API.
        indices = self.predict(xs, xs_ch, lengths)[0]
        output = []
        for j in range(lengths[0]):
            output.append((tokens[j], label_vocab[indices[j]]))
        return output
Beispiel #21
0
    def __init__(self, model, **kwargs):
        """Create the PyTorch tagger trainer with a by-name optimizer choice.

        :param model: tagger model to train
        :param kwargs: supports `nogpu`, `optim` (default 'adam'), `eta`
            (default 0.01), `mom` (default 0.9), and `clip` (default 5)
        """
        super(TaggerTrainerPyTorch, self).__init__()
        self.gpu = not bool(kwargs.get('nogpu', False))
        self.clip = float(kwargs.get('clip', 5))
        self.model = model
        # Reverse table (index -> label) for decoding predictions.
        self.idx2label = revlut(self.model.labels)

        optim = kwargs.get('optim', 'adam')
        eta = float(kwargs.get('eta', 0.01))
        mom = float(kwargs.get('mom', 0.9))
        # Pick the optimizer constructor by name; SGD with momentum is the fallback.
        ctor = {
            'adadelta': torch.optim.Adadelta,
            'adam': torch.optim.Adam,
            'rmsprop': torch.optim.RMSprop,
        }.get(optim)
        if ctor is not None:
            self.optimizer = ctor(model.parameters(), lr=eta)
        else:
            self.optimizer = torch.optim.SGD(model.parameters(), lr=eta, momentum=mom)

        self.crit = model.get_criterion()
        if self.gpu:
            self.model = model.cuda()
            self.crit.cuda()
Beispiel #22
0
    def __init__(self, model, **kwargs):
        """Create the PyTorch tagger trainer (single-GPU only).

        :param model: tagger model to train
        :param kwargs: supports `gpus` (default 1; capped at 1), `span_type`
            (default 'iob'), `verbose`, `clip` (default 5), and `nsteps`,
            plus optimizer settings
        """
        super(TaggerTrainerPyTorch, self).__init__()
        self.gpus = int(kwargs.get('gpus', 1))
        # By default support IOB1/IOB2
        self.span_type = kwargs.get('span_type', 'iob')
        self.verbose = kwargs.get('verbose', False)
        logger.info('Setting span type %s', self.span_type)

        self.model = model
        # Reverse table (index -> label) for decoding predictions.
        self.idx2label = revlut(self.model.labels)
        self.clip = float(kwargs.get('clip', 5))
        self.optimizer = OptimizerManager(self.model, **kwargs)
        # Multi-GPU is not supported by this trainer; clamp to one device.
        if self.gpus > 1:
            logger.info("Trainer for PyTorch tagger currently doesnt support multiple GPUs.  Setting to 1")
            self.gpus = 1
        if self.gpus > 0:
            self.model = model.to_gpu()
        else:
            logger.warning("Requested training on CPU.  This will be slow.")
        self.nsteps = kwargs.get('nsteps', six.MAXSIZE)
Beispiel #23
0
    def predict_text(self, tokens, **kwargs):
        """
        Utility function to convert lists of sentence tokens to integer value one-hots which
        are then passed to the tagger.  The resultant output is then converted back to label and token
        to be printed.

        This method is not aware of any input features other than words and characters (and lengths).  If you
        wish to use other features and have a custom model that is aware of those, use `predict` directly.

        :param tokens: (``list``) A list of tokens
        :param kwargs: supports `featurizer`; when omitted one is built from
            `mxlen`, `maxw` and `zero_alloc` (falling back to instance-level
            limits or the input tokens themselves)
        :return: list of (token, label) pairs
        """
        featurizer = kwargs.get('featurizer')
        if featurizer is None:
            # Prefer explicit kwargs, then instance-level limits, then derive
            # the limits from the input tokens.
            mxlen = kwargs.get(
                'mxlen', self.mxlen if hasattr(self, 'mxlen') else len(tokens))
            maxw = kwargs.get(
                'maxw', self.maxw if hasattr(self, 'maxw') else max(
                    len(token) for token in tokens))
            zero_alloc = kwargs.get('zero_alloc', np.zeros)
            featurizer = WordCharLength(self, mxlen, maxw, zero_alloc)

        # This might be inefficient if the label space is large
        label_vocab = revlut(self.get_labels())

        data = featurizer.run(tokens)
        lengths = data['lengths']
        indices = self.predict(data)[0]
        output = []
        for j in range(lengths[0]):
            output.append((tokens[j], label_vocab[indices[j].item()]))
        return output
Beispiel #24
0
 def __init__(self, *args, **kwargs):
     """Initialize the base service, then build an index -> token table
     for the model's target key so output indices can be decoded."""
     super().__init__(*args, **kwargs)
     # Reverse lookup (index -> token) for the target vocabulary.
     self.idx_to_token = revlut(self.vocabs[self.model.tgt_key])
Beispiel #25
0
 def __init__(self, *args, **kwargs):
     """Initialize the base service, then build an index -> token table
     for the model's target key so output indices can be decoded."""
     super(LanguageModelService, self).__init__(*args, **kwargs)
     # Reverse lookup (index -> token) for the target vocabulary.
     self.idx_to_token = revlut(self.vocabs[self.model.tgt_key])
Beispiel #26
0
 def __init__(self, model):
     """Wrap a tagger model and precompute its index -> label table.

     :param model: tagger model exposing a `labels` vocabulary
     """
     self.model = model
     # Reverse table (index -> label) for decoding model output.
     self.idx2label = revlut(model.labels)
Beispiel #27
0
 def __init__(self, *args, **kwargs):
     """Initialize the base service, then build an index -> token table
     for the model's target key so output indices can be decoded."""
     super(LanguageModelService, self).__init__(*args, **kwargs)
     # Reverse lookup (index -> token) for the target vocabulary.
     self.idx_to_token = revlut(self.vocabs[self.model.tgt_key])
Beispiel #28
0
 def __init__(self, vocabs=None, vectorizers=None, model=None):
     """Build the tagger service and precompute the index -> label table.

     :param vocabs: mapping of feature name to vocabulary
     :param vectorizers: mapping of feature name to vectorizer
     :param model: the loaded tagger model
     """
     super(TaggerService, self).__init__(vocabs, vectorizers, model)
     # Reverse table (index -> label) for decoding model output.
     self.label_vocab = revlut(self.get_labels())