Beispiel #1
0
    def __init__(self, model_file):
        """Load labels, model config, tokenizer, graph and cache of a model.

        Args:
            model_file: Path of the model directory (made absolute). The
                files ``label_map_reverse.json`` and ``model_config.json``
                are read from it, and ``cache.txt`` inside it is used as
                the cache path.

        Raises:
            AssertionError: If the config gives no ``model_name`` and is
                missing ``model_type`` or ``vocab_file``, or if the given
                ``model_name`` is not in ``pretrained_names``.
            ValueError: If the resolved model type is neither ``'bert'``
                nor ``'albert'``.
        """
        self.model_file = os.path.abspath(model_file)

        # label
        label_map_reverse_file = os.path.join(
            self.model_file, 'label_map_reverse.json')
        with tf.gfile.GFile(label_map_reverse_file, 'r') as f:
            self.label_map_reverse = json.load(f)
        # JSON object keys are always strings, so a plain sort would place
        # '10' before '2'. Order numerically when every key parses as an
        # integer; otherwise keep the original lexicographic order.
        # NOTE(review): assumes integer-like keys are label indices -- confirm
        # against the code that writes label_map_reverse.json.
        label_items = self.label_map_reverse.items()
        try:
            label_items = sorted(label_items, key=lambda kv: int(kv[0]))
        except ValueError:
            label_items = sorted(label_items, key=lambda kv: kv[0])
        self.labels = [label for _, label in label_items]

        # model
        model_config_file = os.path.join(
            self.model_file, 'model_config.json')
        with tf.gfile.GFile(model_config_file, 'r') as f:
            self.model_config = json.load(f)
        # Missing or falsy config entries collapse to None (512 for the
        # sequence length).
        self.model_name = self.model_config.get('model_name') or None
        self.model_type = self.model_config.get('model_type') or None
        self.vocab_file = self.model_config.get('vocab_file') or None
        self.max_seq_len = self.model_config.get('max_seq_len') or 512
        if not self.model_name:
            assert all([self.vocab_file, self.model_type]), \
                'If not given model_name provided by open_sources, ' \
                'you should specify the model_type and vocab_file.'
        else:
            assert self.model_name in pretrained_names, \
                '%s not provided by open_sources' % self.model_name
            # A known pretrained name overrides model_type and vocab_file
            # from the config: the type is the prefix before the first '_'.
            self.model_type = pretrained_types.get(self.model_name).split('_')[0]
            pretrained_dir = get_pretrained_model(pretrained_name=self.model_name)
            self.vocab_file = os.path.join(pretrained_dir, 'vocab.txt')

        # tokenizer (both supported model types use the same tokenizer class)
        if self.model_type in ('bert', 'albert'):
            self.tokenizer = FullTokenizer(self.vocab_file)
        else:
            raise ValueError('model_type %s unknown.' % self.model_type)

        # processor
        self._load_processor()

        # build graph
        self._build()

        # load cache
        self.cache_file = os.path.join(
            self.model_file, 'cache.txt')
        self._load_cache()
Beispiel #2
0
 def __init__(self, model_name=None, model_type=None, vocab_file=None,
              config_file=None, init_checkpoint_file=None):
     """Configure the model from a pretrained name or from explicit files.

     Args:
         model_name: Name looked up (lower-cased) in ``pretrained_names``.
             When it is ``None`` or empty, all of the remaining arguments
             must be given.
         model_type: Model type, stored lower-cased. Only used when
             ``model_name`` is not given.
         vocab_file: Vocabulary file path. Only used when ``model_name``
             is not given.
         config_file: Model config file path. Only used when
             ``model_name`` is not given.
         init_checkpoint_file: Initial checkpoint path. Only used when
             ``model_name`` is not given.

     Raises:
         AssertionError: If ``model_name`` is not given and any of the
             other arguments is missing, or if the lower-cased
             ``model_name`` is not in ``pretrained_names``.
     """
     # Lower-case only when a name was passed: the default is None, and
     # calling .lower() on it would raise AttributeError before the
     # explicit-files branch below could run.
     self.model_name = model_name.lower() if model_name is not None else None
     if not model_name:
         assert all([model_type, vocab_file, config_file, init_checkpoint_file]), \
             'If not given model_name provided by open_sources, ' \
             'you should specify all the details of model.'
         self.model_type = model_type.lower()
         self.vocab_file = vocab_file
         self.config_file = config_file
         self.init_checkpoint_file = init_checkpoint_file
     else:
         assert self.model_name in pretrained_names, \
             '%s not provided by open_sources.' % self.model_name
         # The model type is the prefix before the first '_' of the
         # registered pretrained type.
         self.model_type = pretrained_types.get(self.model_name).split('_')[0]
         # NOTE(review): presumably fills vocab/config/checkpoint paths for
         # the pretrained model -- defined outside this view, confirm.
         self._from_pretrained()
     self._set_pretrained_model()