Example #1
def get_tokenizer(pretrained_model_name_or_path: str,
                  *args,
                  batch_size: int = 8,
                  **kwargs) -> T5BatchTokenizer:
    return T5BatchTokenizer(
        AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
                                      *args, use_fast=False, **kwargs),
        batch_size=batch_size)
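
For reference, a minimal usage sketch; the defining module is assumed to import AutoTokenizer from transformers and T5BatchTokenizer from pygaggle (the exact import path is an assumption):

# Hypothetical call site; 't5-base' and the batch size are illustrative.
tokenizer = get_tokenizer('t5-base', batch_size=16)
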
Example #2
def construct_t5(options: PassageRankingEvaluationOptions) -> Reranker:
    device = torch.device(options.device)
    model = T5ForConditionalGeneration.from_pretrained(
        options.model, from_tf=options.from_tf).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(options.model_type)
    tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
    return T5Reranker(model, tokenizer)
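
construct_t5 only touches a handful of attributes on its options argument. A hypothetical stand-in that makes this contract explicit (field names inferred from the accesses above; the defaults are illustrative, not the real PassageRankingEvaluationOptions):

from dataclasses import dataclass

@dataclass
class PassageRankingEvaluationOptionsSketch:
    device: str = 'cuda'                          # passed to torch.device
    model: str = 'castorini/monot5-base-msmarco'  # weights to load
    model_type: str = 't5-base'                   # tokenizer name
    from_tf: bool = False                         # load from a TF checkpoint
    batch_size: int = 8                           # T5BatchTokenizer batch size
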
Example #3
def build_ranker(self) -> T5Reranker:
    loader = CachedT5ModelLoader(settings.t5_model_dir, settings.cache_dir,
                                 'ranker', settings.t5_model_type,
                                 settings.flush_cache)
    device = torch.device(settings.t5_device)
    model = loader.load().to(device).eval()
    tokenizer = T5Tokenizer.from_pretrained(settings.t5_model_type)
    batch_tokenizer = T5BatchTokenizer(tokenizer, settings.t5_batch_size)
    return T5Reranker(model, batch_tokenizer)
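
build_ranker reads everything from a module-level settings object. A hypothetical sketch of the attributes it expects (names inferred from the accesses above; values illustrative):

class SettingsSketch:
    t5_model_dir = 'models/monot5'  # directory with cached T5 weights
    cache_dir = '.cache'            # CachedT5ModelLoader cache location
    t5_model_type = 't5-base'       # tokenizer / model type name
    flush_cache = False             # force a fresh model load
    t5_device = 'cuda'              # torch device string
    t5_batch_size = 8               # reranking batch size

settings = SettingsSketch()
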
Example #4
def construct_t5(options: KaggleEvaluationOptions) -> Reranker:
    loader = CachedT5ModelLoader(SETTINGS.t5_model_dir, SETTINGS.cache_dir,
                                 'ranker', SETTINGS.t5_model_type,
                                 SETTINGS.flush_cache)
    device = torch.device(options.device)
    model = loader.load().to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(
        options.model_name, do_lower_case=options.do_lower_case)
    tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
    return T5Reranker(model, tokenizer)
Example #5
def construct_t5(options: PassageRankingEvaluationOptions) -> Reranker:
    loader = CachedT5ModelLoader(options.model_name_or_path,
                                 SETTINGS.cache_dir,
                                 'ranker',
                                 options.model_type,
                                 SETTINGS.flush_cache)
    device = torch.device(options.device)
    model = loader.load().to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(options.model_type)
    tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
    return T5Reranker(model, tokenizer)
Example #6
    def __init__(
        self,
        model_name_or_instance: Union[
            str, T5ForConditionalGeneration] = 'castorini/monot5-base-msmarco',
        tokenizer_name_or_instance: Union[
            str, QueryDocumentBatchTokenizer] = 't5-base'):
        if isinstance(model_name_or_instance, str):
            device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')
            model_name_or_instance = T5ForConditionalGeneration.from_pretrained(
                model_name_or_instance).to(device).eval()
        self.model = model_name_or_instance

        if isinstance(tokenizer_name_or_instance, str):
            tokenizer_name_or_instance = T5BatchTokenizer(
                AutoTokenizer.from_pretrained(tokenizer_name_or_instance),
                batch_size=8)
        self.tokenizer = tokenizer_name_or_instance

        # Infer the compute device from the model's parameters; next() without
        # a default fails fast if the model somehow has none.
        self.device = next(self.model.parameters()).device
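
Assuming this __init__ belongs to a reranker class along the lines of pygaggle's MonoT5 (the class name here is an assumption), construction accepts either names or ready-made instances:

# Hypothetical usage of the constructor above.
reranker = MonoT5()  # loads 'castorini/monot5-base-msmarco' and a 't5-base' tokenizer
# Or skip the string branches by passing pre-built objects:
# reranker = MonoT5(my_model, my_batch_tokenizer)
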
Example #7
def get_tokenizer(pretrained_model_name_or_path: str = 't5-base',
                  *args, batch_size: int = 8, **kwargs) -> T5BatchTokenizer:
    return T5BatchTokenizer(
        AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
                                      *args, **kwargs),
        batch_size=batch_size
    )
Example #8
    parser.add_argument('--input_file', type=str, required=True)
    parser.add_argument('--query_field', type=str, required=True)
    args = parser.parse_args()

    model_name = 'castorini/monot5-base-msmarco'
    tokenizer_name = 't5-base'
    batch_size = 8

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load monoT5 and move it to the device in inference mode.
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    model = model.to(device).eval()

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenizer = T5BatchTokenizer(tokenizer, batch_size)
    reranker = T5Reranker(model, tokenizer)

    # Build a qid -> query text mapping from a JSONL file of queries.
    query_dict = {}
    with open("train_queries.json") as f:
        for line in f:
            temp = json.loads(line.strip())
            query_dict[temp['qid']] = temp[args.query_field]
    print(len(query_dict))

    # Parse a TREC-style run file: the text before "indri #" holds
    # "<qid> Q0 <docid> <rank> <score>", the text after it is the passage.
    pass_dict = {}
    with open(f"{args.input_file}.trec_json") as f:
        for line in f:
            a, b = line.strip().split("indri #")
            qid, _, docid, _, _ = a.strip().split()
            if qid not in pass_dict: