import numpy as np

def QueryPreprocessingFn(args, line, tokenizer):
    line_arr = line.split('\t')
    q_id = int(line_arr[0])
    if 'fairseq' not in args.train_model_type:
        passage = tokenizer.encode(
            line_arr[1].rstrip(),
            add_special_tokens=True,
            max_length=args.max_query_length)
        pad_token_id = tokenizer.pad_token_id
    elif 'fast' in args.train_model_type:
        full_text = line_arr[1].rstrip().lower()
        passage = tokenizer.encode(full_text, add_special_tokens=True).ids[:args.max_query_length]
        pad_token_id = 1
    else:
        full_text = line_arr[1].rstrip().lower()
        passage = list(np.array(tokenizer.encode(full_text)[:args.max_query_length]))
        pad_token_id = 1

    passage_len = min(len(passage), args.max_query_length)
    input_id_b = pad_input_ids(passage, args.max_query_length, pad_token=pad_token_id)

    return q_id.to_bytes(8, 'big') + passage_len.to_bytes(4, 'big') + np.array(
        input_id_b, np.int32).tobytes()
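# The snippets in this listing call a pad_input_ids helper that is not shown.
# The following is only a minimal sketch of what it presumably does, inferred
# from the call sites (truncate or right-pad to the target length); the default
# pad_token of 0 is an assumption, not taken from the original code.
def pad_input_ids(input_ids, max_length, pad_token=0):
    # Truncate if the sequence is too long, otherwise right-pad with pad_token.
    if len(input_ids) >= max_length:
        return list(input_ids)[:max_length]
    return list(input_ids) + [pad_token] * (max_length - len(input_ids))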
Example #2
def PassagePreprocessingFn(args, line, tokenizer):
    if args.data_type == 0:
        line_arr = line.split('\t')
        p_id = int(line_arr[0][1:])  # remove "D"

        url = line_arr[1].rstrip()
        title = line_arr[2].rstrip()
        p_text = line_arr[3].rstrip()

        full_text = url + "<sep>" + title + "<sep>" + p_text
        # keep only first 10000 characters, should be sufficient for any
        # experiment that uses less than 500 - 1k tokens
        full_text = full_text[:args.max_doc_character]
    else:
        line = line.strip()
        line_arr = line.split('\t')
        p_id = int(line_arr[0])

        p_text = line_arr[1].rstrip()

        # keep only first 10000 characters, should be sufficient for any
        # experiment that uses less than 500 - 1k tokens
        full_text = p_text[:args.max_doc_character]

    passage = tokenizer.encode(
        full_text,
        add_special_tokens=True,
        max_length=args.max_seq_length,
    )
    passage_len = min(len(passage), args.max_seq_length)
    input_id_b = pad_input_ids(passage, args.max_seq_length)

    return p_id.to_bytes(8, 'big') + passage_len.to_bytes(4, 'big') + np.array(
        input_id_b, np.int32).tobytes()
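# The return value is a fixed-width byte record. Below is a sketch of how such
# a record could be read back; decode_record and its max_seq_length argument
# are illustrative names, but the layout (8-byte big-endian id, 4-byte
# big-endian length, then max_seq_length int32 token ids) matches what the
# function above emits.
import numpy as np

def decode_record(record, max_seq_length):
    p_id = int.from_bytes(record[:8], 'big')
    passage_len = int.from_bytes(record[8:12], 'big')
    input_ids = np.frombuffer(record[12:12 + 4 * max_seq_length], dtype=np.int32)
    return p_id, passage_len, input_ids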
Example #3
def PassagePreprocessingFn(args, line, tokenizer):
    if args.data_type == 0:
        line_arr = line.split('\t')
        p_id = int(line_arr[0][1:])  # remove "D"

        url = line_arr[1].rstrip()
        title = line_arr[2].rstrip()
        p_text = line_arr[3].rstrip()

        if 'fast' in args.train_model_type:
            full_text = url.lower() + " [SEP] " + title.lower() + " [SEP] " + p_text.lower()
        elif 'fairseq' in args.train_model_type:
            full_text = url + " </s> " + title + " </s> " + p_text
        else:
            full_text = url + " " + tokenizer.sep_token + " " + title + " " + tokenizer.sep_token + " " + p_text
        # keep only first 10000 characters, should be sufficient for any
        # experiment that uses less than 500 - 1k tokens
        full_text = full_text[:args.max_doc_character]
    else:
        line = line.strip()
        line_arr = line.split('\t')
        p_id = int(line_arr[0])

        p_text = line_arr[1].rstrip()

        # keep only first 10000 characters, should be sufficient for any
        # experiment that uses less than 500 - 1k tokens
        full_text = p_text[:args.max_doc_character]

    if 'fairseq' not in args.train_model_type:
        passage = tokenizer.encode(
            full_text,
            add_special_tokens=True,
            max_length=args.max_seq_length)
        pad_token_id = tokenizer.pad_token_id
    elif 'fast' in args.train_model_type:
        if args.data_type == 1:
            full_text = full_text.lower()
        passage = tokenizer.encode(full_text, add_special_tokens=True).ids[:args.max_seq_length]
        pad_token_id = 1
    else:
        full_text = full_text.lower()
        passage = list(np.array(tokenizer.encode(full_text)[:args.max_seq_length]))
        pad_token_id = 1

    passage_len = min(len(passage), args.max_seq_length)
    input_id_b = pad_input_ids(passage, args.max_seq_length, pad_token=pad_token_id)

    return p_id.to_bytes(8, 'big') + passage_len.to_bytes(4, 'big') + np.array(
        input_id_b, np.int32).tobytes()
Example #4
def QueryPreprocessingFn(args, line, tokenizer):
    line_arr = line.split('\t')
    q_id = int(line_arr[0])

    passage = tokenizer.encode(line_arr[1].rstrip(),
                               add_special_tokens=True,
                               max_length=args.max_query_length)
    passage_len = min(len(passage), args.max_query_length)
    input_id_b = pad_input_ids(passage, args.max_query_length)

    return q_id.to_bytes(8, 'big') + passage_len.to_bytes(4, 'big') + np.array(
        input_id_b, np.int32).tobytes()
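# A hedged usage sketch for the QueryPreprocessingFn variant above. The
# Namespace fields, the "roberta-base" checkpoint, and the query line are
# illustrative assumptions, and a pad_input_ids helper such as the sketch
# earlier in this listing must be in scope.
from argparse import Namespace
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
args = Namespace(max_query_length=64)

# One tab-separated "query_id<TAB>query_text" line, as the function expects.
record = QueryPreprocessingFn(args, "42\twhat is dense retrieval", tokenizer)

# The record is fixed width: 8-byte id, 4-byte length, then the padded int32 ids.
q_id = int.from_bytes(record[:8], 'big')
q_len = int.from_bytes(record[8:12], 'big')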