Code example #1
import argparse
import os

# Assumed import: RobertaTokenizer is taken from Hugging Face's transformers
# package here; the original snippet omits its import.
from transformers import RobertaTokenizer

from tag_mspan_robert_gcn.drop_roberta_mspan_dataset import DropReader as TDropReader

parser = argparse.ArgumentParser()
parser.add_argument("--input_path", type=str)
parser.add_argument("--output_dir", type=str)
parser.add_argument("--passage_length_limit", type=int, default=463)
parser.add_argument("--question_length_limit", type=int, default=46)
parser.add_argument("--tag_mspan", action="store_true")

args = parser.parse_args()

tokenizer = RobertaTokenizer.from_pretrained(args.input_path +
                                             "/roberta.large")

if args.tag_mspan:
    # The dev reader keeps every example (no skip_when_all_empty filter).
    dev_reader = TDropReader(tokenizer, args.passage_length_limit,
                             args.question_length_limit)

    # The train reader drops examples whose answer cannot be produced by any
    # of the supported answer types listed below.
    train_reader = TDropReader(tokenizer,
                               args.passage_length_limit,
                               args.question_length_limit,
                               skip_when_all_empty=[
                                   "passage_span", "question_span",
                                   "addition_subtraction", "counting",
                                   "multi_span"
                               ])

    data_format = "drop_dataset_{}.json"

    data_mode = ["train"]
    for dm in data_mode:
        dpath = os.path.join(args.input_path, data_format.format(dm))
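        # --- Plausible continuation (not present in the source snippet) ---
        # Read the split with the matching reader and cache the tokenized
        # features so training does not re-process the raw JSON. The cache
        # file name and the use of pickle are assumptions, not from the
        # source; `import pickle` would be needed at the top of the script.
        reader = train_reader if dm == "train" else dev_reader
        data = reader._read(dpath)
        out_path = os.path.join(args.output_dir,
                                "cached_{}.pkl".format(dm))  # hypothetical name
        with open(out_path, "wb") as f:
            pickle.dump(data, f)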
Code example #2
# Continuation of the model construction; the earlier lines (imports, the
# network class name, and its other arguments) are omitted in the source.
                use_gcn=args.use_gcn,
                gcn_steps=args.gcn_steps)

if args.cuda:
    network.cuda()
print("Load from pre path {}.".format(args.pre_path))
network.load_state_dict(torch.load(args.pre_path))

print("Load data from {}.".format(args.inf_path))
# args.eng selects the English RoBERTa tokenizer; any other value falls back
# to AutoTokenizer so a language-specific checkpoint can be loaded.
if args.eng != 0:
    tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model)
else:
    tokenizer = AutoTokenizer.from_pretrained(args.roberta_model)
if args.tag_mspan:
    inf_iter = TDropBatchGen(
        args, tokenizer,
        TDropReader(tokenizer, passage_length_limit=463,
                    question_length_limit=46,
                    is_eng=args.eng)._read(args.inf_path))
else:
    inf_iter = DropBatchGen(
        args, tokenizer,
        DropReader(tokenizer, passage_length_limit=463,
                   question_length_limit=46)._read(args.inf_path))

print("Start inference...")
result = {}
network.eval()
with torch.no_grad():
    for batch in tqdm(inf_iter):
        output_dict = network(**batch)
        for i in range(len(output_dict["question_id"])):
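            # --- Plausible continuation (not present in the source snippet) ---
            # Map each question id to its predicted answer; the "answer" key
            # is an assumption about the model's output dictionary.
            result[output_dict["question_id"][i]] = output_dict["answer"][i]

# Write the predictions to disk as JSON; args.dump_path is assumed to be
# defined by the script's argument parser, and `import json` is required.
with open(args.dump_path, "w", encoding="utf8") as f:
    json.dump(result, f, ensure_ascii=False)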