def __init__(self, model_path, tokenizer_path, max_length=64):
    self.model = get_model(
        SimCSE,
        model_path,
        cache_dir='/Users/liyucheng/projects/model_cache/')
    self.tokenizer = get_tokenizer(tokenizer_path, is_zh=True)
    self.max_length = max_length
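# Hedged usage sketch, not part of the original class: one plausible way to
# score a sentence pair with the wrapped SimCSE encoder. The forward signature
# of the lyc SimCSE model (an HF-style output exposing last_hidden_state) is an
# assumption, not taken from the source.
def similarity(self, sent_a, sent_b):
    import torch
    inputs = self.tokenizer([sent_a, sent_b],
                            max_length=self.max_length,
                            truncation=True,
                            padding=True,
                            return_tensors='pt')
    with torch.no_grad():
        out = self.model(**inputs)
    emb = out.last_hidden_state[:, 0]   # [CLS] vectors for the two sentences
    return torch.cosine_similarity(emb[0], emb[1], dim=0).item()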
    def __init__(self,
                 model_path,
                 max_length=64,
                 n_components=768,
                 kernel_bias_path=None,
                 corpus_for_kernel_computing=None,
                 pool='first_last_avg_pooling'):
        """[summary]

        Args:
            model_path ([type]): [description]
            max_length (int, optional): [description]. Defaults to 64.
            n_components (int, optional): [description]. Defaults to 768.
            kernel_bias_path ([type], optional): [description]. Defaults to None.
            corpus_for_kernel_computing ([type], optional): 训练kernel和bias需要的语料,纯txt,一行一个句子. Defaults to None.
        """
        self.model = get_model(Sentence, model_path)
        self.tokenizer = get_tokenizer(model_path, is_zh=True)
        self.max_length = max_length
        self.n_components = n_components
        self.pool = pool
        self._get_kernel_and_bias(model_path, kernel_bias_path,
                                  corpus_for_kernel_computing)
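# A minimal sketch, assuming BERT-whitening-style post-processing, of how the
# kernel and bias referenced by _get_kernel_and_bias above are commonly
# computed; this is not the lyc implementation. `embeddings` is assumed to be
# a (num_sentences, hidden_size) numpy array encoded from the corpus.
import numpy as np

def compute_kernel_and_bias(embeddings, n_components=768):
    mu = embeddings.mean(axis=0, keepdims=True)        # bias recentres the vectors
    cov = np.cov((embeddings - mu).T)                   # feature covariance
    u, s, _ = np.linalg.svd(cov)                        # cov = U diag(s) U^T
    kernel = u @ np.diag(1.0 / np.sqrt(s))              # whitening transform
    return kernel[:, :n_components], -mu                # apply as (x + bias) @ kernel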
                run.set_answer(index, topk)

    def output(self, attrs, file):
        output = {}
        with open(file, 'w', encoding='utf-8') as f:
            for run in self.runs:
                output[run.query] = run.output(attrs)
            json.dump(output, f)

    def acc(self, topks=[1, 3, 5]):
        # fraction of runs whose gold answer appears among the top-k candidates
        self.get_topk(topks)
        r = {}
        for topk in topks:
            positive = np.array([run.in_topk(topk) for run in self.runs])
            acc = positive.sum() / len(self.runs)
            r[topk] = acc
        return r


if __name__ == '__main__':

    recall_json, model_path = sys.argv[1:]

    model = get_model(rerank, model_path)
    tokenizer = get_tokenizer(model_path)

    session = EvalSession(recall_json, tokenizer, model)
    print('ACC: ', session.acc())
    session.output(['top1', 'top3'], 'r.json')
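    # Hypothetical invocation (script name assumed; checkpoint path as above):
    #   python eval_rerank.py recall.json checkpoints/rerank/checkpoint-5000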
Example #4
import sys

# get_base_hf_args, get_tokenizer, get_model, SimCSE, SimCSEDSForYEZI and
# HfTrainer used below are helpers from the lyc package (exact submodules are
# not shown in this snippet)
from lyc.eval import SimCSEEvalAccComputing

EPOCHES = 1

if __name__ == '__main__':
    model_path, tokenizer_path, faq_table = sys.argv[1:]

    args = get_base_hf_args(output_dir='checkpoints/simcse5',
                            train_batch_size=1,
                            epochs=1,
                            lr=1e-5,
                            save_steps=500,
                            save_strategy='steps',
                            save_total_limit=10)

    tokenizer = get_tokenizer(tokenizer_path, is_zh=True)

    ds = SimCSEDSForYEZI(faq_table,
                         tokenizer,
                         steps=4000,
                         repeat=False,
                         csv=True)

    model = get_model(SimCSE, model_path, cache_dir='../model_cache')
    model.config.binary = True

    trainer = HfTrainer(model=model,
                        args=args,
                        train_dataset=ds,
                        tokenizer=tokenizer)
    trainer.train()
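# A hedged sketch (not the lyc implementation) of the unsupervised SimCSE
# objective trained above: each sentence is encoded twice with different
# dropout masks, and the two views are pulled together by an in-batch
# contrastive (InfoNCE) loss over the similarity matrix.
import torch
import torch.nn.functional as F

def simcse_loss(z1, z2, temperature=0.05):
    # z1, z2: (batch, dim) embeddings of the same sentences under two dropout masks
    z1 = F.normalize(z1, dim=-1)
    z2 = F.normalize(z2, dim=-1)
    sim = z1 @ z2.T / temperature            # (batch, batch) cosine similarities
    labels = torch.arange(z1.size(0))        # positives lie on the diagonal
    return F.cross_entropy(sim, labels)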
Example #5
import torch
from dataclasses import dataclass

# TrainingArgs, get_args, get_tokenizer, get_tokenized_ds, get_dataloader and
# the text `processor` used below are helpers from the lyc package (exact
# submodules are not shown in this snippet)
@dataclass
class PblmArgs(TrainingArgs):
    is_zh: bool = True
    ds_name: str = 'train'
    d_model: int = 768
    prefix: str = 'v0.1'


if __name__ == '__main__':
    args = get_args(PblmArgs)
    args.gpus = torch.cuda.device_count()
    args.multigpu = torch.cuda.device_count() > 1

    tokenizer = get_tokenizer(args.tokenizer_name_or_path,
                              is_zh=args.is_zh,
                              max_length=args.max_sent_len,
                              min_sent_length=args.min_sent_len)
    ds = get_tokenized_ds(args.dataset_scripts,
                          args.train_data,
                          tokenizer,
                          max_length=args.max_sent_len,
                          min_sent_length=args.min_sent_len,
                          shuffle=True,
                          tokenize_func='no_padding')
    train_ds = ds[args.ds_name]

    processor.block_size = 512
    processor.tokenizer = tokenizer
    train_ds = train_ds.map(processor.lm_group_texts, batched=True)
    train_ds = train_ds.map(processor.get_true_length, batched=True)
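    # Hedged sketch of what the lm_group_texts step above is assumed to do (the
    # real implementation lives in the lyc processor): concatenate tokenized
    # examples and slice them into fixed-size blocks, as in the standard
    # Hugging Face language-modelling recipe.
    def group_texts_sketch(examples, block_size=512):
        concatenated = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = (len(concatenated['input_ids']) // block_size) * block_size
        return {
            k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated.items()
        }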
    train_dl = get_dataloader(train_ds,