Code example #1
File: bert_vocab.py Project: tgeral68/OpenNIR
 def __init__(self, vocabulary):
     super().__init__(vocabulary)
     layer = vocabulary.config['layer']
     if layer == -1:
         layer = None
     bert_model = bert_models.get_model(vocabulary.config['bert_base'], vocabulary.logger)
     self.bert = CustomBertModelWrapper.from_pretrained(bert_model, depth=layer)
     if vocabulary.config['bert_weights']:
         weight_path = os.path.join(util.path_vocab(vocabulary), vocabulary.config['bert_weights'])
         with vocabulary.logger.duration('loading BERT weights from {}'.format(weight_path)):
             self.bert.load_state_dict(torch.load(weight_path), strict=False)
     self.CLS = vocabulary.tok2id('[CLS]')
     self.SEP = vocabulary.tok2id('[SEP]')
     self.bert.set_trainable(vocabulary.config['train'])
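This constructor wraps a pretrained BERT model, optionally restores fine-tuned weights stored under the vocabulary path, caches the [CLS] and [SEP] token ids, and freezes or unfreezes the encoder according to the 'train' setting. The strict=False argument is what lets a weights file that covers only part of the wrapper's parameters load without error. A minimal sketch of that PyTorch pattern, using a made-up two-layer module rather than OpenNIR's classes:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))

# A checkpoint covering only the first Linear layer.
partial_state = {'0.weight': torch.zeros(4, 4), '0.bias': torch.zeros(4)}

# strict=False tolerates missing keys: the second Linear keeps its
# initialization instead of raising an error.
missing, unexpected = model.load_state_dict(partial_state, strict=False)
print(missing)     # parameters the checkpoint did not provide
print(unexpected)  # checkpoint keys the model does not have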
Code example #2
 def run(self):
     if self.config['bert_weights'] == '':
         raise ValueError('must provide pipeline.bert_weights setting (name of weights file)')
     weight_file = os.path.join(util.path_vocab(self.vocab), self.config['bert_weights'])
     if os.path.exists(weight_file) and not self.config['overwrite']:
         raise ValueError(f'{weight_file} already exists. Please rename pipeline.bert_weights or set pipeline.overwrite=True')
     self._load_ranker_weights(self.ranker, self.vocab, self.trainer, self.valid_pred, self.train_ds)
     old_state_dict = self.ranker.state_dict()
     new_state_dict = OrderedDict()
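     # copy only the parameters under the ranker's BERT encoder, dropping the 'encoder.bert.' prefix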
     for key in old_state_dict:
         if key.startswith('encoder.bert.'):
             new_state_dict[key[len('encoder.bert.'):]] = old_state_dict[key]
     torch.save(new_state_dict, weight_file)
     self.logger.info(f'new BERT state dict saved to {weight_file}')
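The loop above exports just the BERT sub-module from a trained ranker checkpoint, renaming the keys so they match what the standalone encoder expects (and can then be picked up via the bert_weights setting shown in code example #1). The same prefix-stripping step in isolation, with a hypothetical state dict standing in for the real ranker (the key names besides the 'encoder.bert.' prefix are made up):

from collections import OrderedDict
import torch

full_state = OrderedDict([
    ('encoder.bert.embeddings.word_embeddings.weight', torch.zeros(2, 2)),
    ('simmat.weight', torch.zeros(2, 2)),  # unrelated ranker parameter
])

prefix = 'encoder.bert.'
bert_state = OrderedDict(
    (key[len(prefix):], value)
    for key, value in full_state.items()
    if key.startswith(prefix)
)
print(list(bert_state))  # ['embeddings.word_embeddings.weight']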
Code example #3
File: wordvec_vocab.py Project: tgeral68/OpenNIR
 def __init__(self, config, logger, random):
     super().__init__(config, logger)
     self.random = random
     path = util.path_vocab(self)
     cache_path = os.path.join(path, '{source}-{variant}.p'.format(**self.config))
     if not os.path.exists(cache_path):
         fn = _SOURCES[self.config['source']]
         if isinstance(fn, dict):
             fn = fn[self.config['variant']]
         self._terms, self._weights = fn(self.logger)
         with logger.duration(f'writing cached at {cache_path}'):
             with open(cache_path, 'wb') as f:
                 pickle.dump((self._terms, self._weights), f, protocol=4)
     else:
         with logger.duration(f'reading cached at {cache_path}'):
             with open(cache_path, 'rb') as f:
                 self._terms, self._weights = pickle.load(f)
     self._term2idx = {t: i for i, t in enumerate(self._terms)}
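This constructor builds a word-vector vocabulary with a cache-or-compute pattern: on the first run it calls the configured loader to obtain the terms and embedding weights and pickles the pair under the vocabulary path; later runs simply unpickle the cached file. A minimal sketch of that pattern, with a hypothetical build_vectors() standing in for the _SOURCES loader functions:

import os
import pickle

def load_or_build(cache_path, build_vectors):
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    terms, weights = build_vectors()
    with open(cache_path, 'wb') as f:
        # protocol=4 handles large objects such as full embedding matrices
        pickle.dump((terms, weights), f, protocol=4)
    return terms, weights

terms, weights = load_or_build('vectors.p', lambda: (['the', 'cat'], [[0.1], [0.2]]))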