def make_vocab_files(opt, filename, ques_or_ans):
    """Build a vocabulary from the raw data in ``filename`` and save it as JSON.

    The vocabulary is written to ``vocab/<ques_or_ans>_vocab.json`` under
    ``root_path``.

    Args:
        opt: Not used by this function; kept so existing callers keep working.
        filename: Path handed to ``VQADataProvider.load_raw_iqa``.
        ques_or_ans: Either ``"question"`` or ``"answer"``. Selects which of
            the loaded sentence lists feeds ``make_vocab`` and names the
            output file.

    Raises:
        ValueError: If ``ques_or_ans`` is neither ``"question"`` nor
            ``"answer"``.
    """
    # Reject an unknown selector before touching the disk; previously this
    # fell through and passed ``None`` to ``make_vocab``.
    if ques_or_ans not in ("question", "answer"):
        raise ValueError(
            "ques_or_ans must be 'question' or 'answer', got %r"
            % (ques_or_ans,))

    save_path = os.path.join(root_path, "vocab/%s_vocab.json" % ques_or_ans)

    # load data: the second element holds the questions, the third the answers
    _, question_ls, answer_ls = VQADataProvider.load_raw_iqa(filename)
    sentence_ls = question_ls if ques_or_ans == "question" else answer_ls
    vocab_dict = make_vocab(sentence_ls)

    # save to json file
    with open(save_path, "w") as f:
        json.dump(vocab_dict, f)
    print("%s-%s vocabulary saved" % (filename, ques_or_ans))
# Example #2
# 0
def gen_txt():
    """Export the question vocabulary and index-encoded questions as text.

    Reads the vocabulary JSON at ``q_voc_path`` and the raw questions loaded
    from ``q_a_i_path``, then writes two files:

    * ``wordlist_path`` -- one vocabulary word per line, special tokens
      excluded. A word's zero-based line position is its index.
    * ``doc_path`` -- one question per line, as space-separated word indices
      into that word list.

    Nothing is written if any question fails to encode.

    Raises:
        ValueError: If a question contains a word that is neither a special
            token nor present in the vocabulary.
    """
    # exclude <break>, <END>, <START>, <UNKNOWN>, <UNK>
    special_tokens = {"<break>", "<END>", "<START>", "<UNKNOWN>", "<UNK>"}

    with open(q_voc_path, "r") as f:
        q_dic = json.load(f)

    word_list = [word for word in q_dic if word not in special_tokens]
    # Precomputed lookup so each token costs O(1) instead of a linear
    # ``list.index`` scan over the whole vocabulary.
    word_to_idx = {word: idx for idx, word in enumerate(word_list)}

    _, raw_ques, _ = VQADataProvider.load_raw_iqa(q_a_i_path)

    sent_idx_list = []
    for ques in raw_ques:
        tokens = VQADataProvider.text_to_list(ques)
        try:
            sent_idx_list.append(
                [word_to_idx[tok] for tok in tokens
                 if tok not in special_tokens])
        except KeyError as err:
            # Keep the exception type ``list.index`` used to raise, but say
            # which word is missing.
            raise ValueError(
                "%r is not in the question vocabulary" % (err.args[0],))

    with open(wordlist_path, "w") as f:
        f.writelines("%s\n" % word for word in word_list)

    with open(doc_path, "w") as f:
        f.writelines(
            " ".join(str(idx) for idx in sent) + "\n"
            for sent in sent_idx_list)
def make_ans_vocab_file(opt, filename):
    """Pickle the answer vocabulary built from ``filename`` and return its size.

    The third element returned by ``VQADataProvider.load_raw_iqa`` is passed
    to ``make_vocab_ans`` and the result is written to
    ``vocab/answer_vocab.pkl`` under ``root_path``. ``opt`` is not used.

    Returns:
        The number of entries in the vocabulary (``len`` of the result of
        ``make_vocab_ans``).
    """
    pickle_path = os.path.join(root_path, "vocab/answer_vocab.pkl")

    # Only the last of the three loaded fields is needed here.
    _, _, answers = VQADataProvider.load_raw_iqa(filename)
    answer_vocab = make_vocab_ans(answers)

    with open(pickle_path, "wb") as out_file:
        pickle.dump(answer_vocab, out_file)

    # Debug aid: show the first ten vocabulary entries.
    preview = answer_vocab[:10]
    print(preview)

    vocab_size = len(answer_vocab)
    return vocab_size
def check_ans_vocab(filename):
    """Count how often each word occurs in the answers loaded from ``filename``.

    Each sentence in the third element returned by
    ``VQADataProvider.load_raw_iqa`` is tokenised with
    ``VQADataProvider.text_to_list`` and the tokens are tallied.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Words with equal counts keep the order in which they were first seen.
    """
    from collections import Counter

    _, _, sentence_ls = VQADataProvider.load_raw_iqa(filename)

    word_counts = Counter()
    for sent in sentence_ls:
        word_counts.update(VQADataProvider.text_to_list(sent))

    # most_common() with no argument returns every entry, count-descending,
    # with ties in first-encountered order -- the same ordering the previous
    # stable ``sorted(..., reverse=True)`` call produced.
    return word_counts.most_common()