import json
import pickle
from collections import Counter

import nltk
import numpy as np

import utils  # project-local module; assumed to provide utils.encode()


def multichoice_encoding_data(args, vocab, questions, video_names, video_ids,
                              answers, ans_candidates, mode='train'):
    '''Encode questions and answer candidates for multiple-choice QA.'''
    # Encode all questions
    print('Encoding data')
    questions_encoded = []
    questions_len = []
    question_ids = []
    all_answer_cands_encoded = []
    all_answer_cands_len = []
    video_ids_tbw = []
    video_names_tbw = []
    correct_answers = []
    for idx, question in enumerate(questions):
        question = question.lower()[:-1]  # strip the trailing '?'
        question_tokens = nltk.word_tokenize(question)
        question_encoded = utils.encode(question_tokens, vocab['question_answer_token_to_idx'], allow_unk=True)
        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))
        question_ids.append(idx)
        video_names_tbw.append(video_names[idx])
        video_ids_tbw.append(video_ids[idx])
        # ground-truth answer (index of the correct candidate)
        answer = int(answers[idx])
        correct_answers.append(answer)
        # answer candidates
        candidates = ans_candidates[idx]
        candidates_encoded = []
        candidates_len = []
        for ans in candidates:
            ans = ans.lower()
            ans_tokens = nltk.word_tokenize(ans)
            cand_encoded = utils.encode(ans_tokens, vocab['question_answer_token_to_idx'], allow_unk=True)
            candidates_encoded.append(cand_encoded)
            candidates_len.append(len(cand_encoded))
        all_answer_cands_encoded.append(candidates_encoded)
        all_answer_cands_len.append(candidates_len)

    # Pad encoded questions to the longest question in the split
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_answer_token_to_idx']['<NULL>'])

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    # Pad encoded answer candidates to the longest candidate overall
    max_answer_cand_length = max(
        max(len(x) for x in candidate) for candidate in all_answer_cands_encoded)
    for ans_cands in all_answer_cands_encoded:
        for ans in ans_cands:
            while len(ans) < max_answer_cand_length:
                ans.append(vocab['question_answer_token_to_idx']['<NULL>'])
    all_answer_cands_encoded = np.asarray(all_answer_cands_encoded, dtype=np.int32)
    all_answer_cands_len = np.asarray(all_answer_cands_len, dtype=np.int32)
    print(all_answer_cands_encoded.shape)

    # Build the GloVe embedding matrix once, from the training vocabulary
    glove_matrix = None
    if mode in ['train']:
        token_itow = {i: w for w, i in vocab['question_answer_token_to_idx'].items()}
        print("Load glove from %s" % args.glove_pt)
        with open(args.glove_pt, 'rb') as f:
            glove = pickle.load(f)
        dim_word = glove['the'].shape[0]
        glove_matrix = []
        for i in range(len(token_itow)):
            vector = glove.get(token_itow[i], np.zeros((dim_word,)))
            glove_matrix.append(vector)
        glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
        print(glove_matrix.shape)

    print('Writing', args.output_pt.format(args.question_type, args.question_type, mode))
    obj = {
        'questions': questions_encoded,
        'questions_len': questions_len,
        'question_id': question_ids,
        'video_ids': np.asarray(video_ids_tbw),
        'video_names': np.array(video_names_tbw),
        'ans_candidates': all_answer_cands_encoded,
        'ans_candidates_len': all_answer_cands_len,
        'answers': correct_answers,
        'glove': glove_matrix,
    }
    with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f:
        pickle.dump(obj, f)
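

# `utils.encode` is a project-local helper whose source is not part of this
# file. The sketch below is only a plausible reference implementation,
# inferred from the call sites above (token -> index lookup with an optional
# '<UNK>' fallback); the repo's actual utils.encode may differ.
def _encode_sketch(seq_tokens, token_to_idx, allow_unk=False):
    '''Map tokens to vocabulary indices, substituting '<UNK>' when allowed.'''
    seq_idx = []
    for token in seq_tokens:
        if token not in token_to_idx:
            if not allow_unk:
                raise KeyError('token %r is not in the vocabulary' % token)
            token = '<UNK>'
        seq_idx.append(token_to_idx[token])
    return seq_idx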


def process_questions(args):
    '''Build (or load) the vocabulary, then encode question tokens.'''
    print('Loading data')
    with open(args.annotation_file, 'r') as dataset_file:
        instances = json.load(dataset_file)

    # Either create the vocab or load it from disk
    if args.mode in ['train']:
        print('Building vocab')
        answer_cnt = {}
        for instance in instances:
            answer = instance['answer']
            answer_cnt[answer] = answer_cnt.get(answer, 0) + 1

        answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1}
        answer_counter = Counter(answer_cnt)
        frequent_answers = answer_counter.most_common(args.answer_top)
        total_ans = sum(answer_counter.values())
        total_freq_ans = sum(item[1] for item in frequent_answers)
        print("Number of unique answers:", len(answer_counter))
        print("Total number of answers:", total_ans)
        print("Top %i answers account for %.2f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans))

        # Keep only the args.answer_top most frequent answers as classes
        for token, cnt in frequent_answers:
            answer_token_to_idx[token] = len(answer_token_to_idx)
        print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))

        question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
        for instance in instances:
            question = instance['question'].lower()[:-1]  # strip the trailing '?'
            for token in nltk.word_tokenize(question):
                if token not in question_token_to_idx:
                    question_token_to_idx[token] = len(question_token_to_idx)
        print('Get question_token_to_idx')
        print(len(question_token_to_idx))

        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
            'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1},
        }

        print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset))
        with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f:
            json.dump(vocab, f, indent=4)
    else:
        print('Loading vocab')
        with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f:
            vocab = json.load(f)

    # Encode all questions
    print('Encoding data')
    questions_encoded = []
    questions_len = []
    question_ids = []
    video_ids_tbw = []
    video_names_tbw = []
    all_answers = []
    for idx, instance in enumerate(instances):
        question = instance['question'].lower()[:-1]
        question_tokens = nltk.word_tokenize(question)
        question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True)
        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))
        question_ids.append(idx)
        im_name = instance['video_id']
        video_ids_tbw.append(im_name)
        video_names_tbw.append(im_name)

        # Map the answer string to its class index; out-of-vocabulary answers
        # fall back to <UNK0> (train) or <UNK1> (val/test)
        if instance['answer'] in vocab['answer_token_to_idx']:
            answer = vocab['answer_token_to_idx'][instance['answer']]
        elif args.mode in ['train']:
            answer = 0
        elif args.mode in ['val', 'test']:
            answer = 1
        all_answers.append(answer)

    # Pad encoded questions to the longest question in the split
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    glove_matrix = None
    if args.mode == 'train':
        token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
        print("Load glove from %s" % args.glove_pt)
        with open(args.glove_pt, 'rb') as f:
            glove = pickle.load(f)
        dim_word = glove['the'].shape[0]
        glove_matrix = []
        for i in range(len(token_itow)):
            vector = glove.get(token_itow[i], np.zeros((dim_word,)))
            glove_matrix.append(vector)
        glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
        print(glove_matrix.shape)

    print('Writing', args.output_pt.format(args.dataset, args.dataset, args.mode))
    obj = {
        'questions': questions_encoded,
        'questions_len': questions_len,
        'question_id': question_ids,
        'video_ids': np.asarray(video_ids_tbw),
        'video_names': np.array(video_names_tbw),
        'answers': all_answers,
        'glove': glove_matrix,
    }
    with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f:
        pickle.dump(obj, f)
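

# Hypothetical sanity check (not part of the original pipeline): load the
# pickle written by process_questions and print the array shapes. The keys
# match the `obj` dict dumped above; the path argument is whatever
# args.output_pt.format(...) produced.
def _inspect_output(path):
    '''Print the shapes of the arrays stored in an encoded-questions pickle.'''
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    print('questions:', obj['questions'].shape)          # (num_questions, max_question_length)
    print('questions_len:', obj['questions_len'].shape)  # (num_questions,)
    print('answers:', len(obj['answers']))
    if obj['glove'] is not None:                         # only populated for the train split
        print('glove:', obj['glove'].shape)              # (vocab_size, dim_word)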


def openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='train'):
    '''Encode question tokens for open-ended QA.'''
    print('Encoding data')
    questions_encoded = []
    questions_len = []
    video_ids_tbw = []
    video_names_tbw = []
    all_answers = []
    question_ids = []
    for idx, question in enumerate(questions):
        question = question.lower()[:-1]  # strip the trailing '?'
        question_tokens = nltk.word_tokenize(question)
        question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True)
        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))
        question_ids.append(idx)
        video_names_tbw.append(video_names[idx])
        video_ids_tbw.append(video_ids[idx])

        if args.question_type == "frameqa":
            # Classification over the answer vocabulary; out-of-vocabulary
            # answers fall back to index 0 (train) or 1 (val/test)
            answer = answers[idx]
            if answer in vocab['answer_token_to_idx']:
                answer = vocab['answer_token_to_idx'][answer]
            elif mode in ['train']:
                answer = 0
            elif mode in ['val', 'test']:
                answer = 1
        else:
            # Numeric answers (e.g. repetition counting), clamped to at least 1
            answer = max(int(answers[idx]), 1)
        all_answers.append(answer)

    # Pad encoded questions to the longest question in the split
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    glove_matrix = None
    if mode == 'train':
        token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
        print("Load glove from %s" % args.glove_pt)
        with open(args.glove_pt, 'rb') as f:
            glove = pickle.load(f)
        dim_word = glove['the'].shape[0]
        glove_matrix = []
        for i in range(len(token_itow)):
            vector = glove.get(token_itow[i], np.zeros((dim_word,)))
            glove_matrix.append(vector)
        glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
        print(glove_matrix.shape)

    print('Writing', args.output_pt.format(args.question_type, args.question_type, mode))
    obj = {
        'questions': questions_encoded,
        'questions_len': questions_len,
        'question_id': question_ids,
        'video_ids': np.asarray(video_ids_tbw),
        'video_names': np.array(video_names_tbw),
        'answers': all_answers,
        'glove': glove_matrix,
    }
    with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f:
        pickle.dump(obj, f)
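

# Hypothetical command-line driver, included only to show how
# process_questions might be invoked. The flag names mirror the `args`
# attributes read above, but the defaults and template paths are
# placeholders, not the repo's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--annotation_file', type=str, required=True,
                        help='JSON file with question/answer instances')
    parser.add_argument('--dataset', type=str, default='msvd-qa')
    parser.add_argument('--mode', type=str, default='train', choices=['train', 'val', 'test'])
    parser.add_argument('--answer_top', type=int, default=4000,
                        help='keep the N most frequent answers as classes')
    parser.add_argument('--glove_pt', type=str, default='data/glove.pickle',
                        help='pickled dict mapping words to GloVe vectors')
    parser.add_argument('--vocab_json', type=str, default='data/{}/{}_vocab.json')
    parser.add_argument('--output_pt', type=str, default='data/{}/{}_{}_questions.pt')
    process_questions(parser.parse_args())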