def __init__(self, model_path, tokenizer_path, max_length=64,
             cache_dir='/Users/liyucheng/projects/model_cache/'):
    """Load a SimCSE model and its (Chinese) tokenizer.

    Args:
        model_path: path or hub id passed to ``get_model`` for ``SimCSE``.
        tokenizer_path: path or hub id for the tokenizer
            (loaded with ``is_zh=True``).
        max_length (int, optional): maximum sequence length kept on the
            instance for later use. Defaults to 64.
        cache_dir (str, optional): download/cache directory for the model.
            Previously a hard-coded, machine-specific path; now a parameter
            whose default preserves the old behavior.
    """
    self.model = get_model(SimCSE, model_path, cache_dir=cache_dir)
    self.tokenizer = get_tokenizer(tokenizer_path, is_zh=True)
    self.max_length = max_length
def __init__(self, model_path, max_length=64, n_components=768, kernel_bias_path=None, corpus_for_kernel_computing=None, pool='first_last_avg_pooling'):
    """Load a Sentence model plus Chinese tokenizer and prepare the whitening kernel/bias.

    Args:
        model_path: path or hub id, used for both ``get_model(Sentence, ...)``
            and ``get_tokenizer(..., is_zh=True)``.
        max_length (int, optional): maximum sequence length stored on the
            instance for later use. Defaults to 64.
        n_components (int, optional): target dimensionality stored for the
            whitening transform. Defaults to 768.
        kernel_bias_path (optional): location of a precomputed kernel/bias,
            forwarded to ``_get_kernel_and_bias``. Defaults to None.
        corpus_for_kernel_computing (optional): corpus needed to compute the
            kernel and bias — plain txt, one sentence per line.
            Defaults to None.
        pool (str, optional): pooling strategy name stored on the instance.
            Defaults to 'first_last_avg_pooling'.
    """
    self.model = get_model(Sentence, model_path)
    self.tokenizer = get_tokenizer(model_path, is_zh=True)
    self.max_length = max_length
    self.n_components = n_components
    self.pool = pool
    self._get_kernel_and_bias(model_path, kernel_bias_path, corpus_for_kernel_computing)
run.set_answer(index, topk) def output(self, attrs, file): output = {} with open(file, 'w', encoding='utf-8') as f: for run in self.runs: output[run.query] = run.output(attrs) json.dump(output, f) def acc(self, topks=[1, 3, 5]): self.get_topk(topks) r = {} for topk in topks: positive = np.array([run.in_topk(topk) for run in self.runs]) acc = positive.sum() / len(self.runs) r[topk] = acc return r if __name__ == '__main__': recall_json, model_path, = sys.argv[1:] model_path = 'checkpoints/rerank/checkpoint-5000' model = get_model(rerank, model_path) tokenizer = get_tokenizer(model_path) eval = EvalSession(recall_json, tokenizer, model) print('ACC: ', eval.acc()) eval.output(['top1', 'top3'], 'r.json')
from lyc.eval import SimCSEEvalAccComputing

# Number of training epochs. Fix: this constant was defined but unused —
# `epochs` below was a duplicated hard-coded 1; it now reads the constant.
EPOCHES = 1

if __name__ == '__main__':
    # CLI: model path/hub id, tokenizer path, and the FAQ table (csv) to train on.
    model_path, tokenizer_path, faq_table, = sys.argv[1:]

    args = get_base_hf_args(
        output_dir='checkpoints/simcse5',
        train_batch_size=1,
        epochs=EPOCHES,
        lr=1e-5,
        save_steps=500,
        save_strategy='steps',
        save_total_limit=10,
    )

    tokenizer = get_tokenizer(tokenizer_path, is_zh=True)
    # Dataset yields SimCSE pairs from the FAQ table; `csv=True` reads it as csv.
    ds = SimCSEDSForYEZI(faq_table, tokenizer, steps=4000, repeat=False, csv=True)

    model = get_model(SimCSE, model_path, cache_dir='../model_cache')
    model.config.binary = True  # binary (pair-classification) head variant

    trainer = HfTrainer(
        model=model,
        args=args,
        train_dataset=ds,
        tokenizer=tokenizer,
    )
    trainer.train()
@dataclass
class PblmArgs(TrainingArgs):
    # Training configuration; extends the project-level TrainingArgs.
    is_zh: bool = True        # treat corpus as Chinese (passed to the tokenizer)
    ds_name: str = 'train'    # which split of the tokenized dataset to train on
    d_model: int = 768        # model hidden size
    prefix: str = 'v0.1'      # run/version prefix


if __name__ == '__main__':
    args = get_args(PblmArgs)
    # Detect available GPUs; multigpu flags data-parallel training.
    args.gpus = torch.cuda.device_count()
    args.multigpu = torch.cuda.device_count() > 1

    tokenizer = get_tokenizer(args.tokenizer_name_or_path, is_zh=args.is_zh,
                              max_length=args.max_sent_len, min_sent_length=args.min_sent_len)
    ds = get_tokenized_ds(args.dataset_scripts, args.train_data, tokenizer,
                          max_length=args.max_sent_len, min_sent_length=args.min_sent_len,
                          shuffle=True, tokenize_func='no_padding')
    train_ds = ds[args.ds_name]

    # NOTE(review): `processor` is not defined in this chunk — presumably a
    # module-level helper imported elsewhere; confirm before refactoring.
    processor.block_size = 512
    processor.tokenizer = tokenizer
    # Group texts into LM blocks, then record true (unpadded) lengths.
    train_ds = train_ds.map(processor.lm_group_texts, batched=True)
    train_ds = train_ds.map(processor.get_true_length, batched=True)

    # Truncated in this view: the get_dataloader(...) call continues past this chunk.
    train_dl = get_dataloader(train_ds,