# Shared imports for the T5 reranker snippets below. T5BatchTokenizer,
# T5Reranker, and CachedT5ModelLoader come from pygaggle (module paths may
# differ across pygaggle versions); the rest are from torch and transformers.
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer
from pygaggle.model import T5BatchTokenizer
from pygaggle.rerank.transformer import T5Reranker


def get_tokenizer(pretrained_model_name_or_path: str,
                  *args,
                  batch_size: int = 8,
                  **kwargs) -> T5BatchTokenizer:
    # Wrap a slow (SentencePiece) tokenizer in pygaggle's batch tokenizer.
    return T5BatchTokenizer(
        AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
                                      *args, use_fast=False, **kwargs),
        batch_size=batch_size)

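# Usage sketch for get_tokenizer above; 't5-base' and batch_size=16 are
# illustrative values, not defaults mandated by the original code.
tokenizer = get_tokenizer('t5-base', batch_size=16)
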
def construct_t5(options: PassageRankingEvaluationOptions) -> Reranker:
    device = torch.device(options.device)
    model = T5ForConditionalGeneration.from_pretrained(
        options.model, from_tf=options.from_tf).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(options.model_type)
    tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
    return T5Reranker(model, tokenizer)

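# Hypothetical invocation of construct_t5 above. PassageRankingEvaluationOptions
# is defined elsewhere in the evaluation script, so a SimpleNamespace stands in
# for it here; every field value is illustrative.
from types import SimpleNamespace

options = SimpleNamespace(model='castorini/monot5-base-msmarco',
                          model_type='t5-base',
                          device='cuda' if torch.cuda.is_available() else 'cpu',
                          batch_size=8,
                          from_tf=False)
reranker = construct_t5(options)
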
def build_ranker(self) -> T5Reranker:
    # Method of a ranker-builder class; `settings` is a module-level config
    # object supplying model paths, device, and batch size.
    loader = CachedT5ModelLoader(settings.t5_model_dir,
                                 settings.cache_dir,
                                 'ranker',
                                 settings.t5_model_type,
                                 settings.flush_cache)
    device = torch.device(settings.t5_device)
    model = loader.load().to(device).eval()
    tokenizer = T5Tokenizer.from_pretrained(settings.t5_model_type)
    batch_tokenizer = T5BatchTokenizer(tokenizer, settings.t5_batch_size)
    return T5Reranker(model, batch_tokenizer)

def construct_t5(options: KaggleEvaluationOptions) -> Reranker:
    loader = CachedT5ModelLoader(SETTINGS.t5_model_dir,
                                 SETTINGS.cache_dir,
                                 'ranker',
                                 SETTINGS.t5_model_type,
                                 SETTINGS.flush_cache)
    device = torch.device(options.device)
    model = loader.load().to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(
        options.model_name, do_lower_case=options.do_lower_case)
    tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
    return T5Reranker(model, tokenizer)

def construct_t5(options: PassageRankingEvaluationOptions) -> Reranker:
    loader = CachedT5ModelLoader(options.model_name_or_path,
                                 SETTINGS.cache_dir,
                                 'ranker',
                                 options.model_type,
                                 SETTINGS.flush_cache)
    device = torch.device(options.device)
    model = loader.load().to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(options.model_type)
    tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
    return T5Reranker(model, tokenizer)

def __init__(
        self,
        model_name_or_instance: Union[
            str, T5ForConditionalGeneration] = 'castorini/monot5-base-msmarco',
        tokenizer_name_or_instance: Union[
            str, QueryDocumentBatchTokenizer] = 't5-base'):
    # Accept either a ready model/tokenizer instance or a name to load.
    # Note: the Hugging Face model id is lowercase ('monot5', not 'monoT5'),
    # matching the script further below.
    if isinstance(model_name_or_instance, str):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model_name_or_instance = T5ForConditionalGeneration.from_pretrained(
            model_name_or_instance).to(device).eval()
    self.model = model_name_or_instance
    if isinstance(tokenizer_name_or_instance, str):
        tokenizer_name_or_instance = T5BatchTokenizer(
            AutoTokenizer.from_pretrained(tokenizer_name_or_instance),
            batch_size=8)
    self.tokenizer = tokenizer_name_or_instance
    # Infer the device from the model's own parameters.
    self.device = next(self.model.parameters(), None).device

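# Minimal usage sketch, assuming the __init__ above belongs to the T5Reranker
# class used throughout this section and that pygaggle's Query/Text rerank
# interface is available; the query and passages are illustrative only.
from pygaggle.rerank.base import Query, Text

reranker = T5Reranker()  # defaults: castorini/monot5-base-msmarco, t5-base
query = Query('who proposed the geocentric theory')
texts = [Text('The geocentric model was described by Ptolemy.',
              {'docid': 'd1'}, 0),
         Text('Heliocentrism places the Sun at the center.',
              {'docid': 'd2'}, 0)]
for result in reranker.rerank(query, texts):
    print(result.metadata['docid'], result.score)
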
def get_tokenizer(pretrained_model_name_or_path: str = 't5-base',
                  *args,
                  batch_size: int = 8,
                  **kwargs) -> T5BatchTokenizer:
    return T5BatchTokenizer(
        AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
                                      *args, **kwargs),
        batch_size=batch_size)

import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument('--input_file', type=str, required=True)
parser.add_argument('--query_field', type=str, required=True)
args = parser.parse_args()

model_name = 'castorini/monot5-base-msmarco'
tokenizer_name = 't5-base'
batch_size = 8
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer = T5BatchTokenizer(tokenizer, batch_size)
reranker = T5Reranker(model, tokenizer)

# Load queries: one JSON object per line with a 'qid' and the query field.
query_dict = {}
with open("train_queries.json") as f:
    for line in f:
        temp = json.loads(line.strip())
        query_dict[temp['qid']] = temp[args.query_field]
print(len(query_dict))

# Load candidate passages from a TREC-style run whose lines look like
# "<qid> Q0 <docid> <rank> <score> indri # <passage text>".
pass_dict = {}
with open(f"{args.input_file}.trec_json") as f:
    for line in f:
        a, b = line.strip().split("indri #")
        qid, _, docid, _, _ = a.strip().split()
        if qid not in pass_dict:
            pass_dict[qid] = []
        # The original snippet is truncated here; collecting
        # (docid, passage text) pairs per query is an assumed continuation.
        pass_dict[qid].append((docid, b.strip()))
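
# Hedged continuation: rerank each query's candidates with the reranker built
# above. Query/Text are assumed from pygaggle.rerank.base, qid key types are
# assumed to match between the two input files, and the printed run format is
# illustrative rather than taken from the original script.
from pygaggle.rerank.base import Query, Text

for qid, passages in pass_dict.items():
    query = Query(query_dict[qid])
    texts = [Text(text, {'docid': docid}, 0) for docid, text in passages]
    for result in reranker.rerank(query, texts):
        print(qid, result.metadata['docid'], result.score)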