Example 1
 def __init__(self, filepath: str, padding=PAD, name=None, **kwargs):
     self.padding = padding.encode('utf-8')
     self.filepath = filepath
     filepath = get_resource(filepath)
     assert os.path.isfile(filepath), f'Resolved path {filepath} is not a file'
     # Reuse a process-wide cache so the same model file is only loaded once.
     existed = global_cache.get(filepath, None)
     if existed:
         logger.debug('Use cached fasttext model [{}].'.format(filepath))
         self.model = existed
     else:
         logger.debug('Loading fasttext model from [{}].'.format(filepath))
         # fasttext prints a blank line to stdout while loading; silence it
         with stdout_redirected(to=os.devnull, stdout=sys.stderr):
             self.model = fasttext.load_model(filepath)
         global_cache[filepath] = self.model
     # input_dim, output_dim and mask_zero are derived from the model below,
     # so drop any caller-supplied values.
     kwargs.pop('input_dim', None)
     kwargs.pop('output_dim', None)
     kwargs.pop('mask_zero', None)
     if not name:
         name = os.path.splitext(os.path.basename(filepath))[0]
     super().__init__(input_dim=len(self.model.words),
                      output_dim=self.model['king'].size,  # vector size probed with an arbitrary word
                      mask_zero=padding is not None,
                      trainable=False,
                      dtype=tf.string,
                      name=name,
                      **kwargs)
     # Lift the scalar embed() so it maps arrays of tokens to arrays of vectors.
     embed_fn = np.frompyfunc(self.embed, 1, 1)
     # np.vectorize(self.embed, otypes=[np.ndarray]) would work as well
     self._embed_np = embed_fn
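The last two lines rely on np.frompyfunc to lift the scalar embed method over arrays of strings. Below is a self-contained sketch of that trick, with the fasttext lookup replaced by a toy embedding so it runs without a model file; the 8-dimensional embed function is purely illustrative and not part of the original code.

import numpy as np

def embed(word: str) -> np.ndarray:
    """Toy stand-in for the fasttext lookup self.model[word]."""
    rng = np.random.default_rng(abs(hash(word)) % (2 ** 32))
    return rng.standard_normal(8).astype(np.float32)

# np.frompyfunc maps the scalar embed() over arrays of any shape and returns
# an object array whose cells are per-word vectors, like _embed_np above.
embed_fn = np.frompyfunc(embed, 1, 1)
words = np.array([['the', 'king'], ['a', 'queen']], dtype=object)
vectors = embed_fn(words)
print(vectors.shape, vectors[0, 0].shape)  # (2, 2) (8,)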
Example 2
def build_transformer(transformer,
                      max_seq_length,
                      num_labels,
                      tagging=True,
                      tokenizer_only=False):
    if transformer in albert_models_google:
        from bert.tokenization.albert_tokenization import FullTokenizer
        model_url = albert_models_google[transformer]
        albert = True
    elif transformer in bert_models_google:
        from bert.tokenization.bert_tokenization import FullTokenizer
        model_url = bert_models_google[transformer]
        albert = False
    else:
        raise ValueError(
            f'Unknown model {transformer}, available ones: {list(bert_models_google.keys()) + list(albert_models_google.keys())}'
        )
    bert_dir = get_resource(model_url)
    vocab = glob.glob(os.path.join(bert_dir, '*vocab*.txt'))
    assert len(vocab) == 1, f'Expected exactly one vocab file under {bert_dir}, found {len(vocab)}'
    vocab = vocab[0]
    # noinspection PyTypeChecker
    tokenizer = FullTokenizer(vocab_file=vocab)
    if tokenizer_only:
        return tokenizer
    bert_params = bert.params_from_pretrained_ckpt(bert_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")
    l_input_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                        dtype='int32',
                                        name="input_ids")
    l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                       dtype='int32',
                                       name="mask_ids")
    l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                             dtype='int32',
                                             name="token_type_ids")
    output = l_bert([l_input_ids, l_token_type_ids], mask=l_mask_ids)
    if not tagging:
        # Sentence-level task: keep only the [CLS] (first token) representation.
        output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    if bert_params.hidden_dropout:
        output = tf.keras.layers.Dropout(bert_params.hidden_dropout,
                                         name='hidden_dropout')(output)
    logits = tf.keras.layers.Dense(
        num_labels,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            bert_params.initializer_range))(output)
    model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids],
                           outputs=logits)
    model.build(input_shape=(None, max_seq_length))
    ckpt = glob.glob(os.path.join(bert_dir, '*.index'))
    assert ckpt, f'No checkpoint found under {bert_dir}'
    # Strip the '.index' suffix to obtain the checkpoint prefix expected by the weight loaders.
    ckpt, _ = os.path.splitext(ckpt[0])
    with stdout_redirected(to=os.devnull):
        if albert:
            skipped_weight_value_tuples = load_stock_weights(l_bert, ckpt)
        else:
            skipped_weight_value_tuples = bert.load_bert_weights(l_bert, ckpt)
    assert len(skipped_weight_value_tuples) == 0, f'Failed to load pretrained {transformer}'
    return model, tokenizer
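A hedged usage sketch for the function above. The model key 'chinese_L-12_H-768_A-12' is an assumption (it must be present in bert_models_google), the sample sentence is arbitrary, and the checkpoint is expected to be downloadable via get_resource.

import numpy as np

# Hypothetical model key; substitute any key listed in bert_models_google.
model, tokenizer = build_transformer('chinese_L-12_H-768_A-12',
                                     max_seq_length=128,
                                     num_labels=4,
                                     tagging=True)
tokens = ['[CLS]'] + tokenizer.tokenize('商品和服务') + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokens)
pad = 128 - len(input_ids)
input_ids = np.array([input_ids + [0] * pad])
mask_ids = np.array([[1] * len(tokens) + [0] * pad])
token_type_ids = np.zeros_like(input_ids)
# Inputs follow the order declared above: input_ids, mask_ids, token_type_ids.
logits = model.predict([input_ids, mask_ids, token_type_ids])  # shape (1, 128, 4)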
Example 3
 def __init__(self, filepath: str, src, dst=None, **kwargs) -> None:
     if not dst:
         dst = src + '_fasttext'
     self.filepath = filepath
     flash(f'Loading fasttext model {filepath} [blink][yellow]...[/yellow][/blink]')
     filepath = get_resource(filepath)
     with stdout_redirected(to=os.devnull, stdout=sys.stderr):
         self._model = fasttext.load_model(filepath)
     flash('')
     # Probe the embedding size with an arbitrary word.
     output_dim = self._model['king'].size
     super().__init__(output_dim, src, dst)
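The output_dim line above infers the embedding size by looking up an arbitrary word. A minimal standalone sketch of that step, assuming a local fasttext binary model (the path below is only a placeholder):

import fasttext

ft = fasttext.load_model('cc.en.300.bin')  # placeholder path to a .bin model
output_dim = ft['king'].size               # vector size probed with an arbitrary word
assert output_dim == ft.get_dimension()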
Example 4
def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False):
    spm_model_file = None
    if transformer in zh_albert_models_google:
        from bert.tokenization.albert_tokenization import FullTokenizer
        model_url = zh_albert_models_google[transformer]
        albert = True
    elif transformer in albert_models_tfhub:
        from bert.tokenization.albert_tokenization import FullTokenizer
        with stdout_redirected(to=os.devnull):
            model_url = fetch_tfhub_albert_model(transformer,
                                                 os.path.join(hanlp_home(), 'thirdparty', 'tfhub.dev', 'google',
                                                              transformer))
        albert = True
        spm_model_file = glob.glob(os.path.join(model_url, 'assets', '*.model'))
        assert len(spm_model_file) == 1, f'Expected exactly one SentencePiece model under {model_url}'
        spm_model_file = spm_model_file[0]
    elif transformer in bert_models_google:
        from bert.tokenization.bert_tokenization import FullTokenizer
        model_url = bert_models_google[transformer]
        albert = False
    else:
        raise ValueError(
            f'Unknown model {transformer}, available ones: {list(bert_models_google.keys()) + list(zh_albert_models_google.keys()) + list(albert_models_tfhub.keys())}')
    bert_dir = get_resource(model_url)
    if spm_model_file:
        vocab = glob.glob(os.path.join(bert_dir, 'assets', '*.vocab'))
    else:
        vocab = glob.glob(os.path.join(bert_dir, '*vocab*.txt'))
    assert len(vocab) == 1, f'Expected exactly one vocab file under {bert_dir}, found {len(vocab)}'
    vocab = vocab[0]
    lower_case = any(key in transformer for key in ['uncased', 'multilingual', 'chinese', 'albert'])
    if spm_model_file:
        # noinspection PyTypeChecker
        tokenizer = FullTokenizer(vocab_file=vocab, spm_model_file=spm_model_file, do_lower_case=lower_case)
    else:
        tokenizer = FullTokenizer(vocab_file=vocab, do_lower_case=lower_case)
    if tokenizer_only:
        return tokenizer
    if spm_model_file:
        bert_params = albert_params(bert_dir)
    else:
        bert_params = bert.params_from_pretrained_ckpt(bert_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name='albert' if albert else "bert")
    l_input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="input_ids")
    l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="mask_ids")
    l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="token_type_ids")
    output = l_bert([l_input_ids, l_token_type_ids], mask=l_mask_ids)
    if not tagging:
        output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    if bert_params.hidden_dropout:
        output = tf.keras.layers.Dropout(bert_params.hidden_dropout, name='hidden_dropout')(output)
    logits = tf.keras.layers.Dense(num_labels, kernel_initializer=tf.keras.initializers.TruncatedNormal(
        bert_params.initializer_range))(output)
    model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids], outputs=logits)
    model.build(input_shape=(None, max_seq_length))
    if not spm_model_file:
        ckpt = glob.glob(os.path.join(bert_dir, '*.index'))
        assert ckpt, f'No checkpoint found under {bert_dir}'
        ckpt, _ = os.path.splitext(ckpt[0])
    with stdout_redirected(to=os.devnull):
        if albert:
            if spm_model_file:
                skipped_weight_value_tuples = bert.load_albert_weights(l_bert, bert_dir)
            else:
                # noinspection PyUnboundLocalVariable
                skipped_weight_value_tuples = load_stock_weights(l_bert, ckpt)
        else:
            # noinspection PyUnboundLocalVariable
            skipped_weight_value_tuples = bert.load_bert_weights(l_bert, ckpt)
    assert len(skipped_weight_value_tuples) == 0, f'Failed to load pretrained {transformer}'
    return model, tokenizer
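A hedged usage sketch contrasting the tokenizer_only and tagging options. The key 'albert_base' is an assumption here; it must appear in albert_models_tfhub, and any key from the three dictionaries checked above works the same way.

# Tokenizer only: for TF-Hub ALBERT models this is a SentencePiece-backed
# FullTokenizer rather than a plain WordPiece one. 'albert_base' is a
# hypothetical key; replace it with a real entry of albert_models_tfhub.
tokenizer = build_transformer('albert_base', max_seq_length=128,
                              num_labels=2, tokenizer_only=True)
print(tokenizer.tokenize('natural language processing'))

# Full model for sentence classification: tagging=False keeps only the [CLS]
# vector, so the logits come out with shape (batch_size, num_labels).
model, tokenizer = build_transformer('albert_base', max_seq_length=128,
                                     num_labels=2, tagging=False)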