Example #1
def sim_test(args):
    first_dir = args.first_dir
    second_dir = args.second_dir
    first_sents_path = os.path.join(first_dir, "sents.json")
    second_sents_path = os.path.join(second_dir, "sents.json")
    vocab_path = os.path.join(first_dir, "vocab.json")
    vocab = json.load(open(vocab_path, 'r'))
    inv_vocab = {idx: word for word, idx in vocab.items()}
    first_sents = json.load(open(first_sents_path, "r"))
    second_sents = json.load(open(second_sents_path, "r"))
    diff_dict = defaultdict(int)
    pbar = get_pbar(len(first_sents)).start()
    for i, (first_id, sents1) in enumerate(first_sents.items()):
        text1 = sent_to_text(inv_vocab, sents1[0])
        min_second_id, diff = min(
            [[second_id, cdiff(sents1, sents2, len(vocab))]
             for second_id, sents2 in second_sents.items()],
            key=lambda x: x[1])
        text2 = sent_to_text(inv_vocab, second_sents[min_second_id][0])
        diff_dict[diff] += 1
        """
        if diff <= 3:
            print("%s, %s, %d" % (text1, text2, diff))
        """
        pbar.update(i)
    pbar.finish()
    json.dump(diff_dict, open("diff_dict.json", "w"))
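
All of the examples here exercise the same progress-bar helper through the start()/update()/finish() cycle; the excerpts omit their module-level imports (os, json, numpy as np, h5py, collections.defaultdict, jinja2, http.server/socketserver) and project helpers such as cdiff, sent_to_text, and _tokenize. Below is a minimal sketch of what get_pbar might look like, assuming it wraps the classic progressbar package; the project's actual implementation may differ.

# A minimal sketch of the get_pbar helper these examples assume; inferred
# from the start()/update()/finish() call pattern, not taken from the source.
import progressbar


def get_pbar(num, prefix=""):
    # Optional text prefix followed by percentage, bar, and ETA widgets.
    widgets = [prefix, progressbar.Percentage(), ' ',
               progressbar.Bar(), ' ', progressbar.ETA()]
    return progressbar.ProgressBar(widgets=widgets, maxval=num)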
Example #2
def prepro_annos(args):
    """
    Transform DQA annotation.json -> a list of tokenized fact sentences for each image in json file
    The facts are indexed by image id.
    :param args:
    :return:
    """
    data_dir = args.data_dir
    target_dir = args.target_dir

    # For debugging
    if args.debug == 'True':
        sents_path = os.path.join(target_dir, "raw_sents.json")
        answers_path = os.path.join(target_dir, "answers.json")
        sentss_dict = json.load(open(sents_path, 'r'))
        answers_dict = json.load(open(answers_path, 'r'))

    facts_path = os.path.join(target_dir, "raw_facts.json")
    meta_data_path = os.path.join(target_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))
    facts_dict = {}
    annos_dir = os.path.join(data_dir, "annotations")
    anno_names = [name for name in os.listdir(annos_dir) if name.endswith(".json")]
    max_num_facts = 0
    max_fact_size = 0
    pbar = get_pbar(len(anno_names)).start()
    for i, anno_name in enumerate(anno_names):
        image_name, _ = os.path.splitext(anno_name)
        image_id, _ = os.path.splitext(image_name)
        anno_path = os.path.join(annos_dir, anno_name)
        anno = json.load(open(anno_path, 'r'))
        rels = anno2rels(anno)
        id_map = _get_id_map(anno)
        text_facts = [rel2text(id_map, rel) for rel in rels]
        text_facts = list(set(_tokenize(fact) for fact in text_facts if fact is not None))
        max_fact_size = max([max_fact_size] + [len(fact) for fact in text_facts])
        # For debugging only
        if args.debug == 'True':
            if image_id in sentss_dict:
                correct_sents = [sents[answer] for sents, answer in zip(sentss_dict[image_id], answers_dict[image_id])]
                # indexed_facts.extend(correct_sents)
                # FIXME : this is very strong prior!
                text_facts = correct_sents
            else:
                text_facts = []
        facts_dict[image_id] = text_facts
        max_num_facts = max(max_num_facts, len(text_facts))
        pbar.update(i)

    pbar.finish()

    meta_data['max_num_facts'] = max_num_facts
    meta_data['max_fact_size'] = max_fact_size
    print("number of facts: %d" % sum(len(facts) for facts in facts_dict.values()))
    print("max num facts per relation: %d" % max_num_facts)
    print("max fact size: %d" % max_fact_size)
    print("dumping json files ... ")
    json.dump(meta_data, open(meta_data_path, 'w'))
    json.dump(facts_dict, open(facts_path, 'w'))
    print("done")
Example #3
def split_dqa(args):
    data_dir = args.data_dir
    first_dir = args.first_dir
    second_dir = args.second_dir
    if not os.path.exists(first_dir):
        os.mkdir(first_dir)
    if second_dir and not os.path.exists(second_dir):
        os.mkdir(second_dir)
    num = args.num
    image_names = [name for name in os.listdir(os.path.join(data_dir, "images"))
                   if name.endswith(".png") and args.start <= int(os.path.splitext(name)[0]) < args.stop]
    image_names = sorted(image_names, key=lambda x: int(os.path.splitext(x)[0]))
    if args.random == 'True':
        random.shuffle(image_names)
    if num:
        pbar = get_pbar(len(image_names)).start()
        for i, image_name in enumerate(image_names):
            image_id, ext = os.path.splitext(image_name)
            json_name = "%s.json" % image_name
            if i < num:
                to_dir = first_dir
            elif second_dir:
                to_dir = second_dir
            else:
                pbar.update(i)
                continue

            subdirs = ['images', 'annotations', 'questions']
            if args.label == 'True':
                subdirs.append('imagesReplacedText')
            for subdir in subdirs:
                folder_path = os.path.join(to_dir, subdir)
                if not os.path.exists(folder_path):
                    os.mkdir(folder_path)
            if args.skip_images == 'False':
                subdirs = ['images']
                if args.label == 'True':
                    subdirs.append('imagesReplacedText')
                for subdir in subdirs:
                    _copy(data_dir, to_dir, image_name, subdir=subdir)
            _copy(data_dir, to_dir, json_name, subdir='annotations')

            if args.label == 'True':
                _copy(data_dir, to_dir, json_name, subdir='questions')
            else:
                question_path = os.path.join(data_dir, 'questions', json_name)
                if os.path.exists(question_path):
                    question_json = json.load(open(question_path, 'r'))
                    # Copy the keys so entries can be deleted while iterating.
                    keys = list(question_json['questions'].keys())
                    for key in keys:
                        if question_json['questions'][key]['abcLabel']:
                            del question_json['questions'][key]
                    json.dump(question_json, open(os.path.join(to_dir, 'questions', json_name), 'w'))
            pbar.update(i)
        pbar.finish()
    else:
        raise ValueError("args.num must be a positive number of images to split off")

    _copy(data_dir, first_dir, "categories.json")
    if second_dir:
        _copy(data_dir, second_dir, "categories.json")
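
split_dqa reads several string-valued flags ('True'/'False') off args. A hypothetical argparse wiring that would drive it; the flag names are inferred from the attribute accesses above and may not match the project's real CLI:

# Hypothetical CLI wiring for split_dqa, inferred from the args.* accesses.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("data_dir")
parser.add_argument("first_dir")
parser.add_argument("second_dir", nargs="?", default="")
parser.add_argument("--num", type=int, default=0)
parser.add_argument("--start", type=int, default=0)
parser.add_argument("--stop", type=int, default=10**9)
parser.add_argument("--random", default="False")  # string flags, as in the source
parser.add_argument("--label", default="False")
parser.add_argument("--skip_images", default="False")
split_dqa(parser.parse_args())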
Example #4
def evaluate(anno_dict, questions_dict, choicess_dict, answers_dict):
    total = 0
    correct = 0
    incorrect = 0
    guessed = 0
    pbar = get_pbar(len(anno_dict)).start()
    for i, (image_id, anno) in enumerate(anno_dict.items()):
        graph = create_graph(anno)
        questions = questions_dict[image_id]
        choicess = choicess_dict[image_id]
        answers = answers_dict[image_id]
        for question, choices, answer in zip(questions, choicess, answers):
            total += 1
            a = guess(graph, question, choices)
            if a is None:
                # The heuristic abstained; such questions get 0.25 credit below.
                guessed += 1
            elif answer == a:
                correct += 1
            else:
                incorrect += 1
        pbar.update(i)
    pbar.finish()
    print("expected accuracy: (0.25 * %d + %d)/%d = %.4f" %
          (guessed, correct, total, (0.25 * guessed + correct) / total))
    print("precision: %d/%d = %.4f" % (correct, correct + incorrect, correct /
                                       (correct + incorrect)))
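
To make the two printed metrics concrete, a worked example with hypothetical counts:

# Hypothetical counts: 100 questions, the heuristic answers 60 (50 correctly)
# and abstains on 40; abstentions earn 0.25 expected credit each.
guessed, correct, incorrect, total = 40, 50, 10, 100
expected_acc = (0.25 * guessed + correct) / total  # (10 + 50) / 100 = 0.60
precision = correct / (correct + incorrect)        # 50 / 60 = 0.8333...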
Example #5
def prepro_annos(args):
    """
    Transform DQA annotation.json -> a list of tokenized fact sentences for each image in json file
    The facts are indexed by image id.
    :param args:
    :return:
    """
    data_dir = args.data_dir
    target_dir = args.target_dir

    # For debugging
    if args.debug == 'True':
        sents_path = os.path.join(target_dir, "raw_sents.json")
        answers_path = os.path.join(target_dir, "answers.json")
        sentss_dict = json.load(open(sents_path, 'r'))
        answers_dict = json.load(open(answers_path, 'r'))

    facts_path = os.path.join(target_dir, "raw_facts.json")
    meta_data_path = os.path.join(target_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))
    facts_dict = {}
    annos_dir = os.path.join(data_dir, "predictions_052716/dpgs/")
    anno_names = [name for name in os.listdir(annos_dir) if name.endswith(".json")]
    max_num_facts = 0
    max_fact_size = 0
    pbar = get_pbar(len(anno_names)).start()
    for i, anno_name in enumerate(anno_names):
        image_name, _ = os.path.splitext(anno_name)
        image_id, _ = os.path.splitext(image_name)
        anno_path = os.path.join(annos_dir, anno_name)
        anno = json.load(open(anno_path, 'r'))["0"]
        rels = anno2rels(anno)
        id_map = _get_id_map(anno)
        text_facts = [rel2text(id_map, rel) for rel in rels]
        text_facts = list(set(_tokenize(fact) for fact in text_facts if fact is not None))
        max_fact_size = max([max_fact_size] + [len(fact) for fact in text_facts])
        # For debugging only
        if args.debug == 'True':
            if image_id in sentss_dict:
                correct_sents = [sents[answer] for sents, answer in zip(sentss_dict[image_id], answers_dict[image_id])]
                # indexed_facts.extend(correct_sents)
                # FIXME : this is very strong prior!
                text_facts = correct_sents
            else:
                text_facts = []
        facts_dict[image_id] = text_facts
        max_num_facts = max(max_num_facts, len(text_facts))
        pbar.update(i)

    pbar.finish()

    meta_data['max_num_facts'] = max_num_facts
    meta_data['max_fact_size'] = max_fact_size
    print("number of facts: %d" % sum(len(facts) for facts in facts_dict.values()))
    print("max num facts per relation: %d" % max_num_facts)
    print("max fact size: %d" % max_fact_size)
    print("dumping json files ... ")
    json.dump(meta_data, open(meta_data_path, 'w'))
    json.dump(facts_dict, open(facts_path, 'w'))
    print("done")
Example #6
def interpret_relations(args):
    prepro_dir = args.prepro_dir
    meta_data_path = os.path.join(prepro_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))
    data_dir = meta_data['data_dir']

    images_dir = os.path.join(data_dir, 'images')
    annos_dir = os.path.join(data_dir, 'annotations')
    html_path = args.html_path

    sents_path = os.path.join(prepro_dir, 'sents.json')
    relations_path = os.path.join(prepro_dir, 'relations.json')
    vocab_path = os.path.join(prepro_dir, 'vocab.json')
    answers_path = os.path.join(prepro_dir, 'answers.json')
    sentss_dict = json.load(open(sents_path, "r"))
    relations_dict = json.load(open(relations_path, "r"))
    vocab = json.load(open(vocab_path, "r"))
    answers_dict = json.load(open(answers_path, "r"))
    decoder = {idx: word for word, idx in vocab.items()}

    headers = ['iid', 'qid', 'image', 'sents', 'answer', 'annotations', 'relations']
    rows = []
    pbar = get_pbar(len(sentss_dict)).start()
    image_ids = sorted(sentss_dict.keys(), key=lambda x: int(x))
    for i, image_id in enumerate(image_ids):
        sentss = sentss_dict[image_id]
        answers = answers_dict[image_id]
        relations = relations_dict[image_id]
        decoded_relations = [_decode_relation(decoder, relation) for relation in relations]
        for question_id, (sents, answer) in enumerate(zip(sentss, answers)):
            image_name = "%s.png" % image_id
            json_name = "%s.json" % image_name
            image_path = os.path.join(images_dir, image_name)
            anno_path = os.path.join(annos_dir, json_name)
            row = {'image_id': image_id,
                   'question_id': question_id,
                   'image_url': image_path,
                   'anno_url': anno_path,
                   'sents': [_decode_sent(decoder, sent) for sent in sents],
                   'answer': answer,
                   'relations': decoded_relations}
            rows.append(row)
        pbar.update(i)
    pbar.finish()
    var_dict = {'title': "Question List: %d - %d" % (args.start, args.stop - 1),
                'image_width': args.im_width,
                'headers': headers,
                'rows': rows,
                'show_im': args.show_im == 'True'}

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    templates_dir = os.path.join(cur_dir, 'templates')
    env = Environment(loader=FileSystemLoader(templates_dir))
    template = env.get_template(args.template_name)
    out = template.render(**var_dict)
    with open(html_path, "w") as f:
        f.write(out)

    os.system("open %s" % html_path)
Example #7
def prepro_questions(args):
    """
    transform DQA questions.json files -> single statements json and single answers json.
    sentences and answers are doubly indexed by image id first and then question number within that image (0 indexed)
    :param args:
    :return:
    """
    data_dir = args.data_dir
    target_dir = args.target_dir
    questions_dir = os.path.join(data_dir, "questions")
    raw_sents_path = os.path.join(target_dir, "raw_sents.json")
    answers_path = os.path.join(target_dir, "answers.json")
    meta_data_path = os.path.join(target_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))

    sentss_dict = {}
    answers_dict = {}

    ques_names = sorted([name for name in os.listdir(questions_dir) if name.endswith(".json")],
                        key=lambda x: int(os.path.splitext(os.path.splitext(x)[0])[0]))
    num_choices = 0
    num_questions = 0
    max_sent_size = 0
    pbar = get_pbar(len(ques_names)).start()
    for i, ques_name in enumerate(ques_names):
        image_name, _ = os.path.splitext(ques_name)
        image_id, _ = os.path.splitext(image_name)
        sentss = []
        answers = []
        ques_path = os.path.join(questions_dir, ques_name)
        ques = json.load(open(ques_path, "r"))
        for ques_text, d in ques['questions'].items():
            if d['abcLabel']:
                continue
            sents = [_tokenize(qa2hypo(ques_text, choice, args.qa2hypo, args.qa2hypo_path)) for choice in d['answerTexts']]
            max_sent_size = max(max_sent_size, max(len(sent) for sent in sents))
            assert not num_choices or num_choices == len(sents), "number of choices doesn't match: %s" % ques_name
            num_choices = len(sents)
            sentss.append(sents)
            answers.append(d['correctAnswer'])
            num_questions += 1
        sentss_dict[image_id] = sentss
        answers_dict[image_id] = answers
        pbar.update(i)
    pbar.finish()
    meta_data['num_choices'] = num_choices
    meta_data['max_sent_size'] = max_sent_size

    print("number of questions: %d" % num_questions)
    print("number of choices: %d" % num_choices)
    print("max sent size: %d" % max_sent_size)
    print("dumping json file ... ")
    json.dump(sentss_dict, open(raw_sents_path, "w"))
    json.dump(answers_dict, open(answers_path, "w"))
    json.dump(meta_data, open(meta_data_path, "w"))
    print("done")
Example #8
    def eval(self, data_set, is_val=False, eval_tensor_names=()):
        assert isinstance(data_set, DataSet)
        assert self.initialized, "Initialize tower before training."

        params = self.params
        sess = self.sess
        epoch_op = self.tensors['epoch']
        dn = data_set.get_num_batches(partial=True)
        pn = params.val_num_batches if is_val else params.test_num_batches
        num_batches = pn if 0 <= pn <= dn else dn
        num_iters = int(np.ceil(num_batches / self.num_towers))
        num_corrects, total = 0, 0
        eval_values = []
        idxs = []
        losses = []
        N = min(data_set.batch_size * num_batches, data_set.num_examples)
        string = "eval on %s, N=%d|" % (data_set.name, N)
        pbar = get_pbar(num_iters, prefix=string).start()
        for iter_idx in range(num_iters):
            batches = []
            for _ in range(self.num_towers):
                if data_set.has_next_batch(partial=True):
                    idxs.extend(data_set.get_batch_idxs(partial=True))
                    batches.append(data_set.get_next_labeled_batch(partial=True))
            (cur_num_corrects, cur_loss, _, global_step), eval_value_batches = \
                self._eval_batches(batches, eval_tensor_names=eval_tensor_names)
            num_corrects += cur_num_corrects
            total += sum(len(batch[0]) for batch in batches)
            for eval_value_batch in eval_value_batches:
                eval_values.append([x.tolist() for x in eval_value_batch])  # numpy arrays -> plain lists for JSON
            losses.append(cur_loss)
            pbar.update(iter_idx)
        pbar.finish()
        loss = np.mean(losses)
        data_set.reset()

        epoch = sess.run(epoch_op)
        print("at epoch %d: acc = %.2f%% = %d / %d, loss = %.4f" %
              (epoch, 100 * float(num_corrects)/total, num_corrects, total, loss))

        # For outputting eval json files
        ids = [data_set.idx2id[idx] for idx in idxs]
        zipped_eval_values = [list(itertools.chain(*each)) for each in zip(*eval_values)]
        values = {name: values for name, values in zip(eval_tensor_names, zipped_eval_values)}
        out = {'ids': ids, 'values': values}
        eval_path = os.path.join(params.eval_dir, "%s_%s.json" % (data_set.name, str(epoch).zfill(4)))
        json.dump(out, open(eval_path, 'w'))
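
A short sketch (assumed usage, not from the source) of reading one of the dumped eval files back and pairing ids with one tensor's values; the file name and tensor name 'yp' are placeholders:

# Assumed consumer of the eval JSON written above.
import json

out = json.load(open("evals/val_0005.json", "r"))
for id_, yp in zip(out['ids'], out['values'].get('yp', [])):
    print(id_, yp)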
Example #9
    def train(self, train_data_set, val_data_set=None, eval_tensor_names=()):
        assert isinstance(train_data_set, DataSet)
        assert self.initialized, "Initialize tower before training."
        # TODO : allow partial batch

        sess = self.sess
        writer = self.writer
        params = self.params
        num_epochs = params.num_epochs
        num_batches = params.train_num_batches if params.train_num_batches >= 0 else train_data_set.get_num_batches(
            partial=False)
        num_iters_per_epoch = int(num_batches / self.num_towers)
        num_digits = int(np.log10(num_epochs)) + 1  # width for zero-padding epoch labels

        epoch_op = self.tensors['epoch']
        epoch = sess.run(epoch_op)
        print("training %d epochs ... " % num_epochs)
        print("num iters per epoch: %d" % num_iters_per_epoch)
        print("starting from epoch %d." % (epoch + 1))
        while epoch < num_epochs:
            train_args = self._get_train_args(epoch)
            pbar = get_pbar(num_iters_per_epoch, "epoch %s|" %
                            str(epoch + 1).zfill(num_digits)).start()
            for iter_idx in range(num_iters_per_epoch):
                batches = [
                    train_data_set.get_next_labeled_batch()
                    for _ in range(self.num_towers)
                ]
                _, summary, global_step = self._train_batches(
                    batches, **train_args)
                writer.add_summary(summary, global_step)
                pbar.update(iter_idx)
            pbar.finish()
            train_data_set.complete_epoch()

            assign_op = epoch_op.assign_add(1)
            _, epoch = sess.run([assign_op, epoch_op])

            if val_data_set and epoch % params.val_period == 0:
                self.eval(train_data_set,
                          is_val=True,
                          eval_tensor_names=eval_tensor_names)
                self.eval(val_data_set,
                          is_val=True,
                          eval_tensor_names=eval_tensor_names)

            if epoch % params.save_period == 0:
                self.save()
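
The epoch counter survives restarts because it is a graph variable, read with sess.run and advanced with assign_add. A minimal sketch of that pattern, assuming TensorFlow 1.x (inferred from the sess.run/assign_add style above):

# Minimal TF1-style persistent-counter sketch; assumes TensorFlow 1.x.
import tensorflow as tf

epoch_op = tf.Variable(0, trainable=False, name='epoch')
assign_op = epoch_op.assign_add(1)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, epoch = sess.run([assign_op, epoch_op])  # epoch == 1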
Example #10
def load_all(data_dir):
    annos_dir = path.join(data_dir, 'annotations')
    images_dir = path.join(data_dir, 'images')
    questions_dir = path.join(data_dir, 'questions')

    anno_dict = {}
    questions_dict = {}
    choicess_dict = {}
    answers_dict = {}

    image_ids = sorted([path.splitext(name)[0]
                        for name in listdir(images_dir) if name.endswith(".png")],
                       key=lambda x: int(x))
    pbar = get_pbar(len(image_ids)).start()
    for i, image_id in enumerate(image_ids):
        json_name = "%s.png.json" % image_id
        anno_path = path.join(annos_dir, json_name)
        ques_path = path.join(questions_dir, json_name)
        if path.exists(anno_path) and path.exists(ques_path):
            anno = json.load(open(anno_path, "r"))
            ques = json.load(open(ques_path, "r"))

            questions = []
            choicess = []
            answers = []
            for question, d in ques['questions'].items():
                if not d['abcLabel']:
                    choices = d['answerTexts']
                    answer = d['correctAnswer']
                    questions.append(question)
                    choicess.append(choices)
                    answers.append(answer)

            questions_dict[image_id] = questions
            choicess_dict[image_id] = choicess
            answers_dict[image_id] = answers
            anno_dict[image_id] = anno
        pbar.update(i)
    pbar.finish()

    return anno_dict, questions_dict, choicess_dict, answers_dict
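
A hypothetical end-to-end use, feeding load_all's four dictionaries straight into the evaluate example above ("data/dqa" is a placeholder directory):

# Hypothetical usage tying load_all to evaluate (Example #4).
anno_dict, questions_dict, choicess_dict, answers_dict = load_all("data/dqa")
evaluate(anno_dict, questions_dict, choicess_dict, answers_dict)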
Example #11
def build_vocab(args):
    target_dir = args.target_dir
    vocab_path = os.path.join(target_dir, "vocab.json")
    emb_mat_path = os.path.join(target_dir, "init_emb_mat.h5")
    raw_sents_path = os.path.join(target_dir, "raw_sents.json")
    raw_facts_path = os.path.join(target_dir, "raw_facts.json")
    raw_sentss_dict = json.load(open(raw_sents_path, 'r'))
    raw_facts_dict = json.load(open(raw_facts_path, 'r'))

    meta_data_path = os.path.join(target_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, 'r'))
    glove_path = args.glove_path

    word_counter = defaultdict(int)

    for image_id, raw_sentss in raw_sentss_dict.items():
        for raw_sents in raw_sentss:
            for raw_sent in raw_sents:
                for word in raw_sent:
                    _vadd(word_counter, word)

    for image_id, raw_facts in raw_facts_dict.items():
        for raw_fact in raw_facts:
            for word in raw_fact:
                _vadd(word_counter, word)

    word_list, counts = zip(*sorted(word_counter.items(), key=lambda x: -x[1]))
    top_k = 5
    print("top %d frequent words:" % top_k)
    for word, count in zip(word_list[:top_k], counts[:top_k]):
        print("%r: %d" % (word, count))

    features = {}
    word_size = 0
    print("reading %s ... " % glove_path)
    with open(glove_path, 'r') as fp:
        for line in fp:
            array = line.strip().split(" ")
            word = array[0]
            if word in word_counter:
                vector = list(map(float, array[1:]))
                features[word] = vector
                word_size = len(vector)
    print("done")
    vocab_word_list = [word for word in word_list if word in features]
    unknown_word_list = [word for word in word_list if word not in features]
    vocab_size = len(features) + 1  # row 0 of the embedding matrix is reserved for UNK

    f = h5py.File(emb_mat_path, 'w')
    emb_mat = f.create_dataset('data', [vocab_size, word_size], dtype='float')
    vocab = {}
    pbar = get_pbar(len(vocab_word_list)).start()
    for i, word in enumerate(vocab_word_list):
        emb_mat[i+1, :] = features[word]
        vocab[word] = i + 1
        pbar.update(i)
    pbar.finish()
    vocab['UNK'] = 0

    meta_data['vocab_size'] = vocab_size
    meta_data['word_size'] = word_size
    print("num of distinct words: %d" % len(word_counter))
    print("vocab size: %d" % vocab_size)
    print("word size: %d" % word_size)

    print("dumping json file ... ")
    f.close()
    json.dump(vocab, open(vocab_path, "w"))
    json.dump(meta_data, open(meta_data_path, "w"))
    print("done")
Example #12
def list_dqa_questions(args):
    data_dir = args.data_dir
    images_dir = os.path.join(data_dir, "images")
    questions_dir = os.path.join(data_dir, "questions")
    annos_dir = os.path.join(data_dir, "annotations")
    _id = 0
    html_dir = "/tmp/list_dqa_questions_%d" % _id
    while os.path.exists(html_dir):
        _id += 1
        html_dir = "/tmp/list_dqa_questions_%d" % _id

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    templates_dir = os.path.join(cur_dir, 'templates')
    env = Environment(loader=FileSystemLoader(templates_dir))
    template = env.get_template(args.template_name)

    if os.path.exists(html_dir):
        shutil.rmtree(html_dir)
    os.mkdir(html_dir)

    headers = [
        'image_id', 'question_id', 'image', 'question', 'choices', 'answer',
        'annotations'
    ]
    rows = []
    image_names = [name for name in os.listdir(images_dir) if name.endswith('png')]
    image_names = sorted(image_names, key=lambda name: int(os.path.splitext(name)[0]))
    image_names = [name for name in image_names
                   if name.endswith(args.ext) and args.start <= int(os.path.splitext(name)[0]) < args.stop]
    pbar = get_pbar(len(image_names)).start()
    for i, image_name in enumerate(image_names):
        image_id, _ = os.path.splitext(image_name)
        json_name = "%s.json" % image_name
        anno_path = os.path.join(annos_dir, json_name)
        question_path = os.path.join(questions_dir, json_name)
        if os.path.exists(question_path):
            question_dict = json.load(open(question_path, "rb"))
            anno_dict = json.load(open(anno_path, "rb"))
            for j, (question, d) in enumerate(question_dict['questions'].items()):
                row = {'image_id': image_id,
                       'question_id': str(j),
                       'image_url': os.path.join("images" if not d['abcLabel'] else "imagesReplacedText",
                                                 image_name),
                       'anno_url': os.path.join("annotations", json_name),
                       'question': question,
                       'choices': d['answerTexts'],
                       'answer': d['correctAnswer']}
                rows.append(row)

        if i % args.num_im == 0:
            html_path = os.path.join(html_dir, "%s.html" % str(image_id).zfill(8))

        if (i + 1) % args.num_im == 0 or (i + 1) == len(image_names):
            var_dict = {
                'title': "Question List",
                'image_width': args.im_width,
                'headers': headers,
                'rows': rows,
                'show_im': args.show_im
            }
            with open(html_path, "wb") as f:
                f.write(template.render(**var_dict).encode('UTF-8'))
            rows = []
        pbar.update(i)
    pbar.finish()

    os.system("ln -s %s/* %s" % (data_dir, html_dir))
    os.chdir(html_dir)
    port = args.port
    host = args.host

    # Overriding to suppress log messages
    class MyHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass

    handler = MyHandler
    httpd = socketserver.TCPServer((host, port), handler)
    if args.open == 'True':
        os.system("open http://%s:%d" % (args.host, args.port))
    print("serving at %s:%d" % (host, port))
    httpd.serve_forever()
Example #13
    def _setup_pbar(self):
        # Unlike the helper sketched above, this get_pbar variant takes a
        # download ID and a total size rather than a single iteration count.
        self.pbar = None
        self.pbar = get_pbar(self.download.ID, self.download.size)
Example #14
def list_results(args):
    model_num = args.model_num
    config_name = args.config_name
    data_type = args.data_type
    epoch = args.epoch
    configs_path = os.path.join("configs", "m%s.json" % str(model_num).zfill(2))
    configs = json.load(open(configs_path, 'r'))
    config = configs[config_name]
    evals_dir = os.path.join("evals", "m%s" % str(model_num).zfill(2), config_name)
    evals_name = "%s_%s.json" % (data_type, str(epoch).zfill(4))
    evals_path = os.path.join(evals_dir, evals_name)
    evals = json.load(open(evals_path, 'r'))

    fold_path = config['fold_path']
    fold = json.load(open(fold_path, 'r'))
    fold_data_type = 'test' if data_type == 'val' else data_type
    image_ids = sorted(fold[fold_data_type], key=lambda x: int(x))

    prepro_dir = config['data_dir']
    meta_data_path = os.path.join(prepro_dir, "meta_data.json")
    meta_data = json.load(open(meta_data_path, "r"))
    data_dir = meta_data['data_dir']
    _id = 0
    html_dir = "/tmp/list_results%d" % _id
    while os.path.exists(html_dir):
        _id += 1
        html_dir = "/tmp/list_results%d" % _id

    images_dir = os.path.join(data_dir, 'images')
    annos_dir = os.path.join(data_dir, 'annotations')

    sents_path = os.path.join(prepro_dir, 'sents.json')
    facts_path = os.path.join(prepro_dir, 'facts.json')
    vocab_path = os.path.join(prepro_dir, 'vocab.json')
    answers_path = os.path.join(prepro_dir, 'answers.json')
    sentss_dict = json.load(open(sents_path, "r"))
    facts_dict = json.load(open(facts_path, "r"))
    vocab = json.load(open(vocab_path, "r"))
    answers_dict = json.load(open(answers_path, "r"))
    decoder = {idx: word for word, idx in vocab.items()}

    if os.path.exists(html_dir):
        shutil.rmtree(html_dir)
    os.mkdir(html_dir)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    templates_dir = os.path.join(cur_dir, 'templates')
    env = Environment(loader=FileSystemLoader(templates_dir))
    template = env.get_template(args.template_name)

    eval_names = list(evals['values'].keys())
    eval_dd = {}
    for idx, id_ in enumerate(evals['ids']):
        eval_d = {}
        for name, d in evals['values'].items():
            eval_d[name] = d[idx]
        eval_dd[tuple(id_)] = eval_d

    # headers = ['iid', 'qid', 'image', 'sents', 'answer', 'annotations', 'relations'] + eval_names
    headers = ['iid', 'qid', 'image', 'sents', 'annotations', 'relations', 'p', 'yp']
    rows = []
    pbar = get_pbar(len(image_ids)).start()
    for i, image_id in enumerate(image_ids):
        if image_id not in sentss_dict:
            continue
        sentss = sentss_dict[image_id]
        answers = answers_dict[image_id]
        facts = facts_dict.get(image_id, [])
        decoded_facts = [_decode_sent(decoder, fact) for fact in facts]
        for question_id, (sents, answer) in enumerate(zip(sentss, answers)):
            eval_id = (image_id, question_id)
            eval_d = eval_dd.get(eval_id)

            if eval_d:
                p_all = list(zip(*eval_d['p']))
                p = p_all[:len(decoded_facts)]
                p = [[float("%.3f" % x) for x in y] for y in p]
                yp = [float("%.3f" % x) for x in eval_d['yp']]
            else:
                p, yp = [], []

            image_name = "%s.png" % image_id
            json_name = "%s.json" % image_name
            image_url = os.path.join('images', image_name)
            anno_url = os.path.join('annotations', json_name)
            ap = np.argmax(yp) if len(yp) > 0 else 0
            correct = len(yp) > 0 and ap == answer
            row = {'image_id': image_id,
                   'question_id': question_id,
                   'image_url': image_url,
                   'anno_url': anno_url,
                   'sents': [_decode_sent(decoder, sent) for sent in sents],
                   'answer': answer,
                   'facts': decoded_facts,
                   'p': p,
                   'yp': yp,
                   'ap': ap,
                   'correct': correct,
                   }

            rows.append(row)

        if i % args.num_im == 0:
            html_path = os.path.join(html_dir, "%s.html" % str(image_id).zfill(8))

        if (i + 1) % args.num_im == 0 or (i + 1) == len(image_ids):
            var_dict = {'title': "Question List",
                        'image_width': args.im_width,
                        'headers': headers,
                        'rows': rows,
                        'show_im': args.show_im == 'True'}
            with open(html_path, "wb") as f:
                f.write(template.render(**var_dict).encode('UTF-8'))
            rows = []
        pbar.update(i)
    pbar.finish()

    os.system("ln -s %s/* %s" % (data_dir, html_dir))
    os.chdir(html_dir)
    port = args.port
    host = args.host
    # Overriding to suppress log message
    class MyHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass
    handler = MyHandler
    httpd = socketserver.TCPServer((host, port), handler)
    if args.open == 'True':
        os.system("open http://%s:%d" % (args.host, args.port))
    print(("serving at %s:%d" % (host, port)))
    httpd.serve_forever()