# TODO: In maml, all data points in 944K training dataset will be used. So it is much better to use the dict of 944K training the model from scratch. # # List of (question, {question information and answer}) pairs, the training pairs are in format of 1:1. phrase_pairs, emb_dict = data.load_data_MAML( QUESTION_PATH=TRAIN_QUESTION_ANSWER_PATH, DIC_PATH=DIC_PATH, max_tokens=MAX_TOKENS) log.info("Obtained %d phrase pairs with %d uniq words from %s.", len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_ANSWER_PATH) phrase_pairs_944K = data.load_data_MAML( QUESTION_PATH=TRAIN_944K_QUESTION_ANSWER_PATH, max_tokens=MAX_TOKENS) log.info("Obtained %d phrase pairs from %s.", len(phrase_pairs_944K), TRAIN_944K_QUESTION_ANSWER_PATH) data.save_emb_dict(saves_path, emb_dict) end_token = emb_dict[data.END_TOKEN] # Transform token into index in dictionary. train_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict) # # list of (seq1, [seq*]) pairs,把训练对做成1:N的形式; # train_data = data.group_train_data(train_data) train_data = data.group_train_data_RLTR(train_data) train_data_944K = data.encode_phrase_pairs_RLTR(phrase_pairs_944K, emb_dict) train_data_944K = data.group_train_data_RLTR_for_support(train_data_944K) dict944k = data.get944k(DICT_944K) log.info("Reading dict944k from %s is done. %d pairs in dict944k.", DICT_944K, len(dict944k)) dict944k_weak = data.get944k(DICT_944K_WEAK) log.info( "Reading dict944k_weak from %s is done. %d pairs in dict944k_weak", DICT_944K_WEAK, len(dict944k_weak))
def establish_positive_question_documents_pair(MAX_TOKENS):
    """Build and dump the question -> support-document (qid) mapping for the retriever.

    For every training question ("task") a support set is drawn by the
    meta-learner; the qids of the support samples become that question's
    positive documents. When no support set exists, the question's own qid
    is used as the single document. The mapping is written as JSON to
    ../data/auto_QA_data/retriever_question_documents_pair.json.

    :param MAX_TOKENS: maximum token count passed to the data loaders.
    """
    # Dict: word token -> ID.
    docID_dict, _ = data.get_docID_indices(
        data.get_ordered_docID_document(ORDERED_QID_QUESTION_DICT))
    # Index -> qid. NOTE(review): rev_docID_dict is never used below — kept
    # for parity with sibling scripts; confirm before deleting.
    rev_docID_dict = {id: doc for doc, id in docID_dict.items()}
    # List of (question, {question information and answer}) pairs, 1:1 format.
    phrase_pairs, emb_dict = data.load_data_MAML(
        TRAIN_QUESTION_ANSWER_PATH, DIC_PATH, MAX_TOKENS)
    print("Obtained %d phrase pairs with %d uniq words from %s." %
          (len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_ANSWER_PATH))
    phrase_pairs_944K = data.load_data_MAML(
        TRAIN_944K_QUESTION_ANSWER_PATH, max_tokens=MAX_TOKENS)
    print("Obtained %d phrase pairs from %s." %
          (len(phrase_pairs_944K), TRAIN_944K_QUESTION_ANSWER_PATH))
    # Transform tokens into indices in the dictionary.
    train_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict)
    # train_data = data.group_train_data(train_data)
    train_data = data.group_train_data_RLTR(train_data)
    train_data_944K = data.encode_phrase_pairs_RLTR(phrase_pairs_944K, emb_dict)
    train_data_944K = data.group_train_data_RLTR_for_support(train_data_944K)
    dict944k = data.get944k(DICT_944K)
    print("Reading dict944k from %s is done. %d pairs in dict944k." %
          (DICT_944K, len(dict944k)))
    dict944k_weak = data.get944k(DICT_944K_WEAK)
    print("Reading dict944k_weak from %s is done. %d pairs in dict944k_weak" %
          (DICT_944K_WEAK, len(dict944k_weak)))
    metaLearner = metalearner.MetaLearner(
        samples=5, train_data_support_944K=train_data_944K, dict=dict944k,
        dict_weak=dict944k_weak, steps=5, weak_flag=True)

    question_doctments_pair_list = {}
    idx = 0
    for temp_batch in data.iterate_batches(train_data, 1):
        task = temp_batch[0]
        if len(task) == 2 and 'qid' in task[1]:
            # print("Task %s is training..." %(str(task[1]['qid'])))
            # Establish support set for this task.
            support_set = metaLearner.establish_support_set(
                task, metaLearner.steps, metaLearner.weak_flag,
                metaLearner.train_data_support_944K)
            documents = []
            if len(support_set) > 0:
                for support_sample in support_set:
                    if len(support_sample) == 2 and 'qid' in support_sample[1]:
                        documents.append(support_sample[1]['qid'])
            else:
                # Fall back to the question's own qid as its only document.
                print('task %s has no support set!' % (str(task[1]['qid'])))
                documents.append(task[1]['qid'])
            question_doctments_pair_list[task[1]['qid']] = documents
            # Progress heartbeat every 100 processed tasks.
            if idx % 100 == 0:
                print(idx)
            idx += 1
        else:
            print('task has no qid or len(task)!=2:')
            print(task)
    # BUGFIX: use a context manager so the handle is closed on exceptions, and
    # write() instead of writelines() — writelines() on a str iterates it
    # character-by-character.
    with open('../data/auto_QA_data/retriever_question_documents_pair.json',
              'w', encoding="UTF-8") as fw:
        fw.write(json.dumps(question_doctments_pair_list, indent=1,
                            ensure_ascii=False))
    print('Writing retriever_question_documents_pair.json is done!')
os.makedirs(saves_path, exist_ok=True)
# TODO: In maml, all data points in the WEBQSP training dataset will be used,
# so it is better to use the WEBQSP dict when training the model from scratch.
# List of (question, {question information and answer}) pairs, 1:1 format.
phrase_pairs, emb_dict = data.load_data_MAML(TRAIN_QUESTION_ANSWER_PATH,
                                             DIC_PATH, MAX_TOKENS)
log.info("Obtained %d phrase pairs with %d uniq words from %s.",
         len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_ANSWER_PATH)
# WEBQSP pool loaded without DIC_PATH: encoded below with the emb_dict built
# from the training set, then grouped as the support-set pool.
phrase_pairs_webqsp = data.load_data_MAML(
    TRAIN_WEBQSP_QUESTION_ANSWER_PATH, max_tokens=MAX_TOKENS)
log.info("Obtained %d phrase pairs from %s.", len(phrase_pairs_webqsp),
         TRAIN_WEBQSP_QUESTION_ANSWER_PATH)
# Persist the embedding dictionary alongside the model checkpoints.
data.save_emb_dict(saves_path, emb_dict)
end_token = emb_dict[data.END_TOKEN]
# Transform tokens into indices in the dictionary.
train_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict)
# List of (seq1, [seq*]) pairs — group the 1:1 training pairs into 1:N form.
# train_data = data.group_train_data(train_data)
train_data = data.group_train_data_RLTR(train_data)
train_data_webqsp = data.encode_phrase_pairs_RLTR(phrase_pairs_webqsp,
                                                  emb_dict)
train_data_webqsp = data.group_train_data_RLTR_for_support(train_data_webqsp)
dictwebqsp = data.get_webqsp(DICT_WEBQSP)
log.info("Reading dict_webqsp from %s is done. %d pairs in dict_webqsp.",
         DICT_WEBQSP, len(dictwebqsp))
dictwebqsp_weak = data.get_webqsp(DICT_WEBQSP_WEAK)
# BUGFIX: this chunk ended with an unterminated log.info call; completed with
# the arguments implied by the parallel 944K setup block.
log.info("Reading dict_webqsp_weak from %s is done. %d pairs in dict_webqsp_weak",
         DICT_WEBQSP_WEAK, len(dictwebqsp_weak))
# Pick the dataset-specific question/dictionary paths, then load the RL test
# pairs (token-level, without INT mask information).
if args.dataset == "csqa":
    question_path, dic_path = TRAIN_QUESTION_ANSWER_PATH, DIC_PATH
else:
    question_path, dic_path = (TRAIN_QUESTION_ANSWER_PATH_WEBQSP,
                               DIC_PATH_WEBQSP)
phrase_pairs, emb_dict = data.load_RL_data_TR(question_path, dic_path,
                                              MAX_TOKENS)
# BUGFIX: log the path that was actually loaded — the original always logged
# TRAIN_QUESTION_ANSWER_PATH, even on the webqsp branch.
log.info(
    "Obtained %d phrase pairs with %d uniq words from %s without INT mask information.",
    len(phrase_pairs), len(emb_dict), question_path)
# Index -> word.
rev_emb_dict = {idx: word for word, idx in emb_dict.items()}
end_token = emb_dict[data.END_TOKEN]
# Transform tokens into indices in emb_dict.
test_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict)
net = model.PhraseModel(emb_size=model.EMBEDDING_DIM,
                        dict_size=len(emb_dict),
                        hid_size=model.HIDDEN_STATE_SIZE,
                        LSTM_FLAG=args.lstm, ATT_FLAG=args.att)
net = net.cuda()
# model_path = '../data/saves/rl_even_adaptive_1%/' + str(args.name) + '/' + str(args.model)
model_path = '../data/saves/webqsp0517/' + str(args.name) + '/' + str(
    args.model)
net.load_state_dict(torch.load(model_path))
# (duplicate `end_token` assignment removed — it was already set above)
true_reward_test = run_test(test_data, net, rev_emb_dict, end_token, device)
os.makedirs(saves_path, exist_ok=True)
# TODO: In maml, all data points in WEBQSP training dataset will be used. So
# it is much better to use the dict of WEBQSP training the model from scratch.
# List of (question, {question information and answer}) pairs, 1:1 format.
phrase_pairs, emb_dict = data.load_data_MAML(TRAIN_QUESTION_ANSWER_PATH,
                                             DIC_PATH, MAX_TOKENS)
log.info("Obtained %d phrase pairs with %d uniq words from %s.",
         len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_ANSWER_PATH)
# WEBQSP pool loaded without DIC_PATH: encoded below with the emb_dict built
# from the training set, then grouped as the support-set pool.
phrase_pairs_WEBQSP = data.load_data_MAML(
    TRAIN_WEBQSP_QUESTION_ANSWER_PATH, max_tokens=MAX_TOKENS)
log.info("Obtained %d phrase pairs from %s.", len(phrase_pairs_WEBQSP),
         TRAIN_WEBQSP_QUESTION_ANSWER_PATH)
# Persist the embedding dictionary alongside the model checkpoints.
data.save_emb_dict(saves_path, emb_dict)
end_token = emb_dict[data.END_TOKEN]
# Transform tokens into indices in the dictionary.
train_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict)
# List of (seq1, [seq*]) pairs — group the 1:1 training pairs into 1:N form.
# train_data = data.group_train_data(train_data)
train_data = data.group_train_data_RLTR(train_data)
train_data_WEBQSP = data.encode_phrase_pairs_RLTR(phrase_pairs_WEBQSP,
                                                  emb_dict)
train_data_WEBQSP = data.group_train_data_RLTR_for_support(train_data_WEBQSP)
dictwebqsp = data.get_webqsp(DICT_WEBQSP)
log.info("Reading dict_webqsp from %s is done. %d pairs in dict_webqsp.",
         DICT_WEBQSP, len(dictwebqsp))
dictwebqsp_weak = data.get_webqsp(DICT_WEBQSP_WEAK)
# BUGFIX: this chunk ended with an unterminated log.info call; completed with
# the arguments implied by the parallel 944K setup block.
log.info("Reading dict_webqsp_weak from %s is done. %d pairs in dict_webqsp_weak",
         DICT_WEBQSP_WEAK, len(dictwebqsp_weak))