def sim_test(args):
    first_dir = args.first_dir
    second_dir = args.second_dir
    first_sents_path = os.path.join(first_dir, "sents.json")
    second_sents_path = os.path.join(second_dir, "sents.json")
    vocab_path = os.path.join(first_dir, "vocab.json")
    vocab = json.load(open(vocab_path, 'r'))
    inv_vocab = {idx: word for word, idx in vocab.items()}
    first_sents = json.load(open(first_sents_path, "r"))
    second_sents = json.load(open(second_sents_path, "r"))
    diff_dict = defaultdict(int)
    pbar = get_pbar(len(first_sents)).start()
    for i, (first_id, sents1) in enumerate(first_sents.items()):
        text1 = sent_to_text(inv_vocab, sents1[0])
        min_second_id, diff = min([[second_id, cdiff(sents1, sents2, len(vocab))]
                                   for second_id, sents2 in second_sents.items()],
                                  key=lambda x: x[1])
        text2 = sent_to_text(inv_vocab, second_sents[min_second_id][0])
        diff_dict[diff] += 1
        # Debug print for near-duplicate pairs (text1/text2 are only used here):
        # if diff <= 3:
        #     print("%s, %s, %d" % (text1, text2, diff))
        pbar.update(i)
    pbar.finish()
    json.dump(diff_dict, open("diff_dict.json", "w"))
def prepro_annos(args):
    """
    Transform DQA annotation .json files into a list of tokenized fact sentences
    per image, written to a single json file. The facts are indexed by image id.
    :param args:
    :return:
    """
    data_dir = args.data_dir
    target_dir = args.target_dir
    # For debugging
    if args.debug == 'True':
        sents_path = os.path.join(target_dir, "raw_sents.json")
        answers_path = os.path.join(target_dir, "answers.json")
        sentss_dict = json.load(open(sents_path, 'r'))
        answers_dict = json.load(open(answers_path, 'r'))
    facts_path = os.path.join(target_dir, "raw_facts.json")
    meta_data_path = os.path.join(target_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))
    facts_dict = {}
    annos_dir = os.path.join(data_dir, "annotations")
    anno_names = [name for name in os.listdir(annos_dir) if name.endswith(".json")]
    max_num_facts = 0
    max_fact_size = 0
    pbar = get_pbar(len(anno_names)).start()
    for i, anno_name in enumerate(anno_names):
        image_name, _ = os.path.splitext(anno_name)
        image_id, _ = os.path.splitext(image_name)
        anno_path = os.path.join(annos_dir, anno_name)
        anno = json.load(open(anno_path, 'r'))
        rels = anno2rels(anno)
        id_map = _get_id_map(anno)
        text_facts = [rel2text(id_map, rel) for rel in rels]
        text_facts = list(set(_tokenize(fact) for fact in text_facts if fact is not None))
        max_fact_size = max([max_fact_size] + [len(fact) for fact in text_facts])
        # For debugging only
        if args.debug == 'True':
            if image_id in sentss_dict:
                correct_sents = [sents[answer] for sents, answer
                                 in zip(sentss_dict[image_id], answers_dict[image_id])]
                # indexed_facts.extend(correct_sents)
                # FIXME : this is very strong prior!
                text_facts = correct_sents
            else:
                text_facts = []
        facts_dict[image_id] = text_facts
        max_num_facts = max(max_num_facts, len(text_facts))
        pbar.update(i)
    pbar.finish()
    meta_data['max_num_facts'] = max_num_facts
    meta_data['max_fact_size'] = max_fact_size
    print("number of facts: %d" % sum(len(facts) for facts in facts_dict.values()))
    print("max num facts per image: %d" % max_num_facts)
    print("max fact size: %d" % max_fact_size)
    print("dumping json files ... ")
    json.dump(meta_data, open(meta_data_path, 'w'))
    json.dump(facts_dict, open(facts_path, 'w'))
    print("done")
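# Sketch of the output written above (for orientation only): raw_facts.json
# maps each image id to a list of tokenized fact sentences derived from that
# image's annotation relations, e.g.
#   {"<image_id>": [["token", ...], ...], ...}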
def split_dqa(args):
    data_dir = args.data_dir
    first_dir = args.first_dir
    second_dir = args.second_dir
    if not os.path.exists(first_dir):
        os.mkdir(first_dir)
    if second_dir and not os.path.exists(second_dir):
        os.mkdir(second_dir)
    num = args.num
    image_names = [name for name in os.listdir(os.path.join(data_dir, "images"))
                   if name.endswith(".png") and args.start <= int(os.path.splitext(name)[0]) < args.stop]
    image_names = sorted(image_names, key=lambda x: int(os.path.splitext(x)[0]))
    if args.random == 'True':
        random.shuffle(image_names)
    if num:
        pbar = get_pbar(len(image_names)).start()
        for i, image_name in enumerate(image_names):
            image_id, ext = os.path.splitext(image_name)
            json_name = "%s.json" % image_name
            if i < num:
                to_dir = first_dir
            elif second_dir:
                to_dir = second_dir
            else:
                pbar.update(i)
                continue
            subdirs = ['images', 'annotations', 'questions']
            if args.label == 'True':
                subdirs.append('imagesReplacedText')
            for subdir in subdirs:
                folder_path = os.path.join(to_dir, subdir)
                if not os.path.exists(folder_path):
                    os.mkdir(folder_path)
            if args.skip_images == 'False':
                subdirs = ['images']
                if args.label == 'True':
                    subdirs.append('imagesReplacedText')
                for subdir in subdirs:
                    _copy(data_dir, to_dir, image_name, subdir=subdir)
            _copy(data_dir, to_dir, json_name, subdir='annotations')
            if args.label == 'True':
                _copy(data_dir, to_dir, json_name, subdir='questions')
            else:
                question_path = os.path.join(data_dir, 'questions', json_name)
                if os.path.exists(question_path):
                    question_json = json.load(open(question_path, 'r'))
                    # Materialize the keys so we can delete entries while iterating.
                    keys = list(question_json['questions'].keys())
                    for key in keys:
                        if question_json['questions'][key]['abcLabel']:
                            del question_json['questions'][key]
                    json.dump(question_json,
                              open(os.path.join(to_dir, 'questions', json_name), 'w'))
            pbar.update(i)
        pbar.finish()
    else:
        raise Exception("num must be specified")
    _copy(data_dir, first_dir, "categories.json")
    if second_dir:
        _copy(data_dir, second_dir, "categories.json")
def evaluate(anno_dict, questions_dict, choicess_dict, answers_dict):
    total = 0
    correct = 0
    incorrect = 0
    guessed = 0
    pbar = get_pbar(len(anno_dict)).start()
    for i, (image_id, anno) in enumerate(anno_dict.items()):
        graph = create_graph(anno)
        questions = questions_dict[image_id]
        choicess = choicess_dict[image_id]
        answers = answers_dict[image_id]
        for question, choices, answer in zip(questions, choicess, answers):
            total += 1
            a = guess(graph, question, choices)
            if a is None:
                guessed += 1
            elif answer == a:
                correct += 1
            else:
                incorrect += 1
        pbar.update(i)
    pbar.finish()
    print("expected accuracy: (0.25 * %d + %d)/%d = %.4f" %
          (guessed, correct, total, (0.25 * guessed + correct) / total))
    print("precision: %d/%d = %.4f" %
          (correct, correct + incorrect, correct / (correct + incorrect)))
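# A minimal sketch of the expected-accuracy arithmetic printed above: an
# unanswered question is scored as a uniform random guess, worth
# 1/num_choices in expectation (0.25 here, since the questions have four
# choices). `expected_accuracy` is a name introduced for illustration.
def expected_accuracy(guessed, correct, total, num_choices=4):
    """Expected accuracy if every unanswered question is guessed uniformly."""
    return (correct + guessed / num_choices) / total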
def prepro_annos(args):
    """
    Transform DQA annotation .json files into a list of tokenized fact sentences
    per image, written to a single json file. The facts are indexed by image id.

    This variant reads predicted annotations from predictions_052716/dpgs/
    (each prediction file is keyed by "0") rather than the gold annotations.
    :param args:
    :return:
    """
    data_dir = args.data_dir
    target_dir = args.target_dir
    # For debugging
    if args.debug == 'True':
        sents_path = os.path.join(target_dir, "raw_sents.json")
        answers_path = os.path.join(target_dir, "answers.json")
        sentss_dict = json.load(open(sents_path, 'r'))
        answers_dict = json.load(open(answers_path, 'r'))
    facts_path = os.path.join(target_dir, "raw_facts.json")
    meta_data_path = os.path.join(target_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))
    facts_dict = {}
    annos_dir = os.path.join(data_dir, "predictions_052716/dpgs/")
    anno_names = [name for name in os.listdir(annos_dir) if name.endswith(".json")]
    max_num_facts = 0
    max_fact_size = 0
    pbar = get_pbar(len(anno_names)).start()
    for i, anno_name in enumerate(anno_names):
        image_name, _ = os.path.splitext(anno_name)
        image_id, _ = os.path.splitext(image_name)
        anno_path = os.path.join(annos_dir, anno_name)
        anno = json.load(open(anno_path, 'r'))["0"]
        rels = anno2rels(anno)
        id_map = _get_id_map(anno)
        text_facts = [rel2text(id_map, rel) for rel in rels]
        text_facts = list(set(_tokenize(fact) for fact in text_facts if fact is not None))
        max_fact_size = max([max_fact_size] + [len(fact) for fact in text_facts])
        # For debugging only
        if args.debug == 'True':
            if image_id in sentss_dict:
                correct_sents = [sents[answer] for sents, answer
                                 in zip(sentss_dict[image_id], answers_dict[image_id])]
                # indexed_facts.extend(correct_sents)
                # FIXME : this is very strong prior!
                text_facts = correct_sents
            else:
                text_facts = []
        facts_dict[image_id] = text_facts
        max_num_facts = max(max_num_facts, len(text_facts))
        pbar.update(i)
    pbar.finish()
    meta_data['max_num_facts'] = max_num_facts
    meta_data['max_fact_size'] = max_fact_size
    print("number of facts: %d" % sum(len(facts) for facts in facts_dict.values()))
    print("max num facts per image: %d" % max_num_facts)
    print("max fact size: %d" % max_fact_size)
    print("dumping json files ... ")
    json.dump(meta_data, open(meta_data_path, 'w'))
    json.dump(facts_dict, open(facts_path, 'w'))
    print("done")
def interpret_relations(args):
    prepro_dir = args.prepro_dir
    meta_data_path = os.path.join(prepro_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))
    data_dir = meta_data['data_dir']
    images_dir = os.path.join(data_dir, 'images')
    annos_dir = os.path.join(data_dir, 'annotations')
    html_path = args.html_path
    sents_path = os.path.join(prepro_dir, 'sents.json')
    relations_path = os.path.join(prepro_dir, 'relations.json')
    vocab_path = os.path.join(prepro_dir, 'vocab.json')
    answers_path = os.path.join(prepro_dir, 'answers.json')
    sentss_dict = json.load(open(sents_path, "r"))
    relations_dict = json.load(open(relations_path, "r"))
    vocab = json.load(open(vocab_path, "r"))
    answers_dict = json.load(open(answers_path, "r"))
    decoder = {idx: word for word, idx in vocab.items()}
    headers = ['iid', 'qid', 'image', 'sents', 'answer', 'annotations', 'relations']
    rows = []
    pbar = get_pbar(len(sentss_dict)).start()
    image_ids = sorted(sentss_dict.keys(), key=lambda x: int(x))
    for i, image_id in enumerate(image_ids):
        sentss = sentss_dict[image_id]
        answers = answers_dict[image_id]
        relations = relations_dict[image_id]
        decoded_relations = [_decode_relation(decoder, relation) for relation in relations]
        for question_id, (sents, answer) in enumerate(zip(sentss, answers)):
            image_name = "%s.png" % image_id
            json_name = "%s.json" % image_name
            image_path = os.path.join(images_dir, image_name)
            anno_path = os.path.join(annos_dir, json_name)
            row = {'image_id': image_id,
                   'question_id': question_id,
                   'image_url': image_path,
                   'anno_url': anno_path,
                   'sents': [_decode_sent(decoder, sent) for sent in sents],
                   'answer': answer,
                   'relations': decoded_relations}
            rows.append(row)
        pbar.update(i)
    pbar.finish()
    var_dict = {'title': "Question List: %d - %d" % (args.start, args.stop - 1),
                'image_width': args.im_width,
                'headers': headers,
                'rows': rows,
                'show_im': args.show_im == 'True'}
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    templates_dir = os.path.join(cur_dir, 'templates')
    env = Environment(loader=FileSystemLoader(templates_dir))
    template = env.get_template(args.template_name)
    out = template.render(**var_dict)
    with open(html_path, "w") as f:
        f.write(out)
    os.system("open %s" % html_path)
def prepro_questions(args):
    """
    Transform DQA questions.json files into a single sentences json and a single
    answers json. Sentences and answers are doubly indexed: by image id first,
    then by question number within that image (0-indexed).
    :param args:
    :return:
    """
    data_dir = args.data_dir
    target_dir = args.target_dir
    questions_dir = os.path.join(data_dir, "questions")
    raw_sents_path = os.path.join(target_dir, "raw_sents.json")
    answers_path = os.path.join(target_dir, "answers.json")
    meta_data_path = os.path.join(target_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))
    sentss_dict = {}
    answers_dict = {}
    ques_names = sorted([name for name in os.listdir(questions_dir)
                         if os.path.splitext(name)[1].endswith(".json")],
                        key=lambda x: int(os.path.splitext(os.path.splitext(x)[0])[0]))
    num_choices = 0
    num_questions = 0
    max_sent_size = 0
    pbar = get_pbar(len(ques_names)).start()
    for i, ques_name in enumerate(ques_names):
        image_name, _ = os.path.splitext(ques_name)
        image_id, _ = os.path.splitext(image_name)
        sentss = []
        answers = []
        ques_path = os.path.join(questions_dir, ques_name)
        ques = json.load(open(ques_path, "r"))
        for ques_id, (ques_text, d) in enumerate(ques['questions'].items()):
            if d['abcLabel']:
                continue
            sents = [_tokenize(qa2hypo(ques_text, choice, args.qa2hypo, args.qa2hypo_path))
                     for choice in d['answerTexts']]
            max_sent_size = max(max_sent_size, max(len(sent) for sent in sents))
            assert not num_choices or num_choices == len(sents), \
                "number of choices doesn't match: %s" % ques_name
            num_choices = len(sents)
            sentss.append(sents)
            answers.append(d['correctAnswer'])
            num_questions += 1
        sentss_dict[image_id] = sentss
        answers_dict[image_id] = answers
        pbar.update(i)
    pbar.finish()
    meta_data['num_choices'] = num_choices
    meta_data['max_sent_size'] = max_sent_size
    print("number of questions: %d" % num_questions)
    print("number of choices: %d" % num_choices)
    print("max sent size: %d" % max_sent_size)
    print("dumping json file ... ")
    json.dump(sentss_dict, open(raw_sents_path, "w"))
    json.dump(answers_dict, open(answers_path, "w"))
    json.dump(meta_data, open(meta_data_path, "w"))
    print("done")
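# Sketch of the double indexing described in the docstring above (orientation
# only, not pipeline code):
#   sentss_dict[image_id][q][c]  # token list for choice c of question q
#   answers_dict[image_id][q]    # index of the correct choice for question q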
def eval(self, data_set, is_val=False, eval_tensor_names=()):
    assert isinstance(data_set, DataSet)
    assert self.initialized, "Initialize tower before evaluating."
    params = self.params
    sess = self.sess
    epoch_op = self.tensors['epoch']
    dn = data_set.get_num_batches(partial=True)
    pn = params.val_num_batches if is_val else params.test_num_batches
    num_batches = pn if 0 <= pn <= dn else dn
    num_iters = int(np.ceil(num_batches / self.num_towers))
    num_corrects, total = 0, 0
    eval_values = []
    idxs = []
    losses = []
    N = min(data_set.batch_size * num_batches, data_set.num_examples)
    string = "eval on %s, N=%d|" % (data_set.name, N)
    pbar = get_pbar(num_iters, prefix=string).start()
    for iter_idx in range(num_iters):
        batches = []
        for _ in range(self.num_towers):
            if data_set.has_next_batch(partial=True):
                idxs.extend(data_set.get_batch_idxs(partial=True))
                batches.append(data_set.get_next_labeled_batch(partial=True))
        (cur_num_corrects, cur_loss, _, global_step), eval_value_batches = \
            self._eval_batches(batches, eval_tensor_names=eval_tensor_names)
        num_corrects += cur_num_corrects
        total += sum(len(batch[0]) for batch in batches)
        for eval_value_batch in eval_value_batches:
            eval_values.append([x.tolist() for x in eval_value_batch])  # numpy array -> list
        losses.append(cur_loss)
        pbar.update(iter_idx)
    pbar.finish()
    loss = np.mean(losses)
    data_set.reset()
    epoch = sess.run(epoch_op)
    print("at epoch %d: acc = %.2f%% = %d / %d, loss = %.4f" %
          (epoch, 100 * float(num_corrects) / total, num_corrects, total, loss))

    # For outputting eval json files
    ids = [data_set.idx2id[idx] for idx in idxs]
    zipped_eval_values = [list(itertools.chain(*each)) for each in zip(*eval_values)]
    values = {name: values for name, values in zip(eval_tensor_names, zipped_eval_values)}
    out = {'ids': ids, 'values': values}
    eval_path = os.path.join(params.eval_dir, "%s_%s.json" % (data_set.name, str(epoch).zfill(4)))
    json.dump(out, open(eval_path, 'w'))
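# Illustration of the transpose-and-flatten used above: eval_values holds one
# entry per evaluated batch, each aligned with eval_tensor_names;
# zip(*eval_values) regroups the values by tensor name, and chain(*...)
# flattens across batches. A toy self-contained check:
#
#   eval_values = [[[1, 2], [10]], [[3], [20, 30]]]  # 2 iters x 2 tensors
#   zipped = [list(itertools.chain(*each)) for each in zip(*eval_values)]
#   assert zipped == [[1, 2, 3], [10, 20, 30]]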
def train(self, train_data_set, val_data_set=None, eval_tensor_names=()):
    assert isinstance(train_data_set, DataSet)
    assert self.initialized, "Initialize tower before training."
    # TODO : allow partial batch
    sess = self.sess
    writer = self.writer
    params = self.params
    num_epochs = params.num_epochs
    num_batches = params.train_num_batches if params.train_num_batches >= 0 \
        else train_data_set.get_num_batches(partial=False)
    num_iters_per_epoch = int(num_batches / self.num_towers)
    num_digits = int(np.log10(num_batches))
    epoch_op = self.tensors['epoch']
    epoch = sess.run(epoch_op)
    print("training %d epochs ... " % num_epochs)
    print("num iters per epoch: %d" % num_iters_per_epoch)
    print("starting from epoch %d." % (epoch + 1))
    while epoch < num_epochs:
        train_args = self._get_train_args(epoch)
        pbar = get_pbar(num_iters_per_epoch, "epoch %s|" % str(epoch + 1).zfill(num_digits)).start()
        for iter_idx in range(num_iters_per_epoch):
            batches = [train_data_set.get_next_labeled_batch() for _ in range(self.num_towers)]
            _, summary, global_step = self._train_batches(batches, **train_args)
            writer.add_summary(summary, global_step)
            pbar.update(iter_idx)
        pbar.finish()
        train_data_set.complete_epoch()
        assign_op = epoch_op.assign_add(1)
        _, epoch = sess.run([assign_op, epoch_op])
        if val_data_set and epoch % params.val_period == 0:
            self.eval(train_data_set, is_val=True, eval_tensor_names=eval_tensor_names)
            self.eval(val_data_set, is_val=True, eval_tensor_names=eval_tensor_names)
        if epoch % params.save_period == 0:
            self.save()
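# Hedged usage sketch (the runner instance and tensor names here are
# assumptions, not defined by the code above): after initializing the towers,
# a driver would alternate training with periodic validation, e.g.
#   runner.train(train_data_set, val_data_set, eval_tensor_names=('p', 'yp'))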
def load_all(data_dir):
    annos_dir = path.join(data_dir, 'annotations')
    images_dir = path.join(data_dir, 'images')
    questions_dir = path.join(data_dir, 'questions')
    anno_dict = {}
    questions_dict = {}
    choicess_dict = {}
    answers_dict = {}
    image_ids = sorted([path.splitext(name)[0] for name in listdir(images_dir)
                        if name.endswith(".png")], key=lambda x: int(x))
    pbar = get_pbar(len(image_ids)).start()
    for i, image_id in enumerate(image_ids):
        json_name = "%s.png.json" % image_id
        anno_path = path.join(annos_dir, json_name)
        ques_path = path.join(questions_dir, json_name)
        if path.exists(anno_path) and path.exists(ques_path):
            anno = json.load(open(anno_path, "r"))
            ques = json.load(open(ques_path, "r"))
            questions = []
            choicess = []
            answers = []
            for question, d in ques['questions'].items():
                if not d['abcLabel']:
                    choices = d['answerTexts']
                    answer = d['correctAnswer']
                    questions.append(question)
                    choicess.append(choices)
                    answers.append(answer)
            questions_dict[image_id] = questions
            choicess_dict[image_id] = choicess
            answers_dict[image_id] = answers
            anno_dict[image_id] = anno
        pbar.update(i)
    pbar.finish()
    return anno_dict, questions_dict, choicess_dict, answers_dict
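# Typical wiring of load_all with the evaluate function defined earlier;
# data_dir is any directory containing annotations/, images/, and questions/
# subfolders. `evaluate_dir` is a helper name chosen here for illustration.
def evaluate_dir(data_dir):
    anno_dict, questions_dict, choicess_dict, answers_dict = load_all(data_dir)
    evaluate(anno_dict, questions_dict, choicess_dict, answers_dict)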
def build_vocab(args):
    target_dir = args.target_dir
    vocab_path = os.path.join(target_dir, "vocab.json")
    emb_mat_path = os.path.join(target_dir, "init_emb_mat.h5")
    raw_sents_path = os.path.join(target_dir, "raw_sents.json")
    raw_facts_path = os.path.join(target_dir, "raw_facts.json")
    raw_sentss_dict = json.load(open(raw_sents_path, 'r'))
    raw_facts_dict = json.load(open(raw_facts_path, 'r'))
    meta_data_path = os.path.join(target_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, 'r'))
    glove_path = args.glove_path

    word_counter = defaultdict(int)
    for image_id, raw_sentss in raw_sentss_dict.items():
        for raw_sents in raw_sentss:
            for raw_sent in raw_sents:
                for word in raw_sent:
                    _vadd(word_counter, word)
    for image_id, raw_facts in raw_facts_dict.items():
        for raw_fact in raw_facts:
            for word in raw_fact:
                _vadd(word_counter, word)

    word_list, counts = zip(*sorted(word_counter.items(), key=lambda x: -x[1]))
    freq = 5
    print("top %d frequent words:" % freq)
    for word, count in zip(word_list[:freq], counts[:freq]):
        print("%r: %d" % (word, count))

    features = {}
    word_size = 0
    print("reading %s ... " % glove_path)
    with open(glove_path, 'r') as fp:
        for line in fp:
            array = line.strip().split(" ")
            word = array[0]
            if word in word_counter:
                vector = list(map(float, array[1:]))
                features[word] = vector
                word_size = len(vector)
    print("done")

    vocab_word_list = [word for word in word_list if word in features]
    unknown_word_list = [word for word in word_list if word not in features]
    vocab_size = len(features) + 1  # +1 for the UNK token at row 0
    f = h5py.File(emb_mat_path, 'w')
    emb_mat = f.create_dataset('data', [vocab_size, word_size], dtype='float')
    vocab = {}
    pbar = get_pbar(len(vocab_word_list)).start()
    for i, word in enumerate(vocab_word_list):
        emb_mat[i + 1, :] = features[word]
        vocab[word] = i + 1
        pbar.update(i)
    pbar.finish()
    vocab['UNK'] = 0
    meta_data['vocab_size'] = vocab_size
    meta_data['word_size'] = word_size
    print("num of distinct words: %d" % len(word_counter))
    print("vocab size: %d" % vocab_size)
    print("word size: %d" % word_size)
    print("dumping json file ... ")
    f.close()
    json.dump(vocab, open(vocab_path, "w"))
    json.dump(meta_data, open(meta_data_path, "w"))
    print("done")
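# Hedged usage sketch for reading back the artifacts written above: vocab.json
# maps word -> row index in the h5 matrix, with row 0 reserved for 'UNK'.
# (`lookup_vector` is a name introduced here for illustration.)
def lookup_vector(target_dir, word):
    vocab = json.load(open(os.path.join(target_dir, "vocab.json"), "r"))
    with h5py.File(os.path.join(target_dir, "init_emb_mat.h5"), "r") as f:
        emb_mat = f['data'][:]  # shape: [vocab_size, word_size]
    return emb_mat[vocab.get(word, vocab['UNK'])]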
def list_dqa_questions(args):
    data_dir = args.data_dir
    images_dir = os.path.join(data_dir, "images")
    questions_dir = os.path.join(data_dir, "questions")
    annos_dir = os.path.join(data_dir, "annotations")

    _id = 0
    html_dir = "/tmp/list_dqa_questions_%d" % _id
    while os.path.exists(html_dir):
        _id += 1
        html_dir = "/tmp/list_dqa_questions_%d" % _id

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    templates_dir = os.path.join(cur_dir, 'templates')
    env = Environment(loader=FileSystemLoader(templates_dir))
    template = env.get_template(args.template_name)

    if os.path.exists(html_dir):
        shutil.rmtree(html_dir)
    os.mkdir(html_dir)

    headers = ['image_id', 'question_id', 'image', 'question', 'choices', 'answer', 'annotations']
    rows = []
    image_names = [name for name in os.listdir(images_dir) if name.endswith('png')]
    image_names = sorted(image_names, key=lambda name: int(os.path.splitext(name)[0]))
    image_names = [name for name in image_names
                   if name.endswith(args.ext) and args.start <= int(os.path.splitext(name)[0]) < args.stop]
    pbar = get_pbar(len(image_names)).start()
    for i, image_name in enumerate(image_names):
        image_id, _ = os.path.splitext(image_name)
        json_name = "%s.json" % image_name
        anno_path = os.path.join(annos_dir, json_name)
        question_path = os.path.join(questions_dir, json_name)
        if os.path.exists(question_path):
            question_dict = json.load(open(question_path, "r"))
            anno_dict = json.load(open(anno_path, "r"))
            for j, (question, d) in enumerate(question_dict['questions'].items()):
                row = {'image_id': image_id,
                       'question_id': str(j),
                       'image_url': os.path.join("images" if not d['abcLabel'] else "imagesReplacedText",
                                                 image_name),
                       'anno_url': os.path.join("annotations", json_name),
                       'question': question,
                       'choices': d['answerTexts'],
                       'answer': d['correctAnswer']}
                rows.append(row)
        if i % args.num_im == 0:
            html_path = os.path.join(html_dir, "%s.html" % str(image_id).zfill(8))
        if (i + 1) % args.num_im == 0 or (i + 1) == len(image_names):
            var_dict = {'title': "Question List",
                        'image_width': args.im_width,
                        'headers': headers,
                        'rows': rows,
                        'show_im': args.show_im == 'True'}
            with open(html_path, "wb") as f:
                f.write(template.render(**var_dict).encode('UTF-8'))
            rows = []
        pbar.update(i)
    pbar.finish()

    os.system("ln -s %s/* %s" % (data_dir, html_dir))
    os.chdir(html_dir)

    port = args.port
    host = args.host

    # Overriding to suppress log message
    class MyHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass

    handler = MyHandler
    httpd = socketserver.TCPServer((host, port), handler)
    if args.open == 'True':
        os.system("open http://%s:%d" % (args.host, args.port))
    print("serving at %s:%d" % (host, port))
    httpd.serve_forever()
def _setup_pbar(self):
    # Define the attribute up front so it exists even if get_pbar fails.
    self.pbar = None
    self.pbar = get_pbar(self.download.ID, self.download.size)
def list_results(args):
    model_num = args.model_num
    config_name = args.config_name
    data_type = args.data_type
    epoch = args.epoch
    configs_path = os.path.join("configs", "m%s.json" % str(model_num).zfill(2))
    configs = json.load(open(configs_path, 'r'))
    config = configs[config_name]
    evals_dir = os.path.join("evals", "m%s" % str(model_num).zfill(2), config_name)
    evals_name = "%s_%s.json" % (data_type, str(epoch).zfill(4))
    evals_path = os.path.join(evals_dir, evals_name)
    evals = json.load(open(evals_path, 'r'))

    fold_path = config['fold_path']
    fold = json.load(open(fold_path, 'r'))
    fold_data_type = 'test' if data_type == 'val' else data_type
    image_ids = sorted(fold[fold_data_type], key=lambda x: int(x))

    prepro_dir = config['data_dir']
    meta_data_path = os.path.join(prepro_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))
    data_dir = meta_data['data_dir']

    _id = 0
    html_dir = "/tmp/list_results%d" % _id
    while os.path.exists(html_dir):
        _id += 1
        html_dir = "/tmp/list_results%d" % _id

    images_dir = os.path.join(data_dir, 'images')
    annos_dir = os.path.join(data_dir, 'annotations')
    sents_path = os.path.join(prepro_dir, 'sents.json')
    facts_path = os.path.join(prepro_dir, 'facts.json')
    vocab_path = os.path.join(prepro_dir, 'vocab.json')
    answers_path = os.path.join(prepro_dir, 'answers.json')
    sentss_dict = json.load(open(sents_path, "r"))
    facts_dict = json.load(open(facts_path, "r"))
    vocab = json.load(open(vocab_path, "r"))
    answers_dict = json.load(open(answers_path, "r"))
    decoder = {idx: word for word, idx in vocab.items()}

    if os.path.exists(html_dir):
        shutil.rmtree(html_dir)
    os.mkdir(html_dir)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    templates_dir = os.path.join(cur_dir, 'templates')
    env = Environment(loader=FileSystemLoader(templates_dir))
    template = env.get_template(args.template_name)

    eval_names = list(evals['values'].keys())
    eval_dd = {}
    for idx, id_ in enumerate(evals['ids']):
        eval_d = {name: d[idx] for name, d in evals['values'].items()}
        eval_dd[tuple(id_)] = eval_d

    # headers = ['iid', 'qid', 'image', 'sents', 'answer', 'annotations', 'relations'] + eval_names
    headers = ['iid', 'qid', 'image', 'sents', 'annotations', 'relations', 'p', 'yp']
    rows = []
    pbar = get_pbar(len(sentss_dict)).start()
    for i, image_id in enumerate(image_ids):
        if image_id not in sentss_dict:
            continue
        sentss = sentss_dict[image_id]
        answers = answers_dict[image_id]
        facts = facts_dict[image_id] if image_id in facts_dict else []
        decoded_facts = [_decode_sent(decoder, fact) for fact in facts]
        for question_id, (sents, answer) in enumerate(zip(sentss, answers)):
            eval_id = (image_id, question_id)
            eval_d = eval_dd[eval_id] if eval_id in eval_dd else None
            if eval_d:
                p_all = list(zip(*eval_d['p']))
                p = p_all[:len(decoded_facts)]
                p = [[float("%.3f" % x) for x in y] for y in p]
                yp = [float("%.3f" % x) for x in eval_d['yp']]
            else:
                p, yp = [], []
            # evals_row = [eval_d[name] if eval_d else "" for name in eval_names]  # unused; would shadow `evals`
            image_name = "%s.png" % image_id
            json_name = "%s.json" % image_name
            image_url = os.path.join('images', image_name)
            anno_url = os.path.join('annotations', json_name)
            ap = np.argmax(yp) if len(yp) > 0 else 0
            correct = len(yp) > 0 and ap == answer
            row = {'image_id': image_id,
                   'question_id': question_id,
                   'image_url': image_url,
                   'anno_url': anno_url,
                   'sents': [_decode_sent(decoder, sent) for sent in sents],
                   'answer': answer,
                   'facts': decoded_facts,
                   'p': p,
                   'yp': yp,
                   'ap': ap,
                   'correct': correct,
                   }
            rows.append(row)
        if i % args.num_im == 0:
            html_path = os.path.join(html_dir, "%s.html" % str(image_id).zfill(8))
        if (i + 1) % args.num_im == 0 or (i + 1) == len(image_ids):
            var_dict = {'title': "Question List",
                        'image_width': args.im_width,
                        'headers': headers,
                        'rows': rows,
                        'show_im': args.show_im == 'True'}
            with open(html_path, "wb") as f:
                f.write(template.render(**var_dict).encode('UTF-8'))
            rows = []
        pbar.update(i)
    pbar.finish()

    os.system("ln -s %s/* %s" % (data_dir, html_dir))
    os.chdir(html_dir)

    port = args.port
    host = args.host

    # Overriding to suppress log message
    class MyHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass

    handler = MyHandler
    httpd = socketserver.TCPServer((host, port), handler)
    if args.open == 'True':
        os.system("open http://%s:%d" % (args.host, args.port))
    print("serving at %s:%d" % (host, port))
    httpd.serve_forever()