コード例 #1
0
ファイル: run_model.py プロジェクト: arjunakula/neurips2021
def run_single_example(args, model):
    """Run `model` on the single image / referring expression named in `args`.

    The image at `args.image` is resized to (`args.image_height`,
    `args.image_width`), normalized per channel and fed through the CNN
    returned by `build_cnn`; `args.refexp` is tokenized and encoded with the
    vocab returned by `load_vocab`. The predicted answer is printed, followed
    by the predicted program when `model` is a tuple.

    Args:
        args: options object. Reads `use_gpu`, `image`, `image_height`,
            `image_width`, `refexp` and, for tuple models, `temperature` and
            `sample_argmax`; it is also forwarded to `build_cnn` and
            `load_vocab`.
        model: either a `(program_generator, execution_engine)` tuple or a
            single module invoked as `model(refexp_var, feats_var)`.
    """
    dtype = torch.cuda.FloatTensor if args.use_gpu == 1 else torch.FloatTensor

    # Feature-extraction CNN
    print('Loading CNN for feature extraction')
    cnn = build_cnn(args, dtype)

    # Read the image, resize it, and move channels first with a batch axis.
    target_size = (args.image_height, args.image_width)
    pixels = imresize(imread(args.image, mode='RGB'),
                      target_size,
                      interp='bicubic')
    pixels = pixels.transpose(2, 0, 1)[None]

    # Per-channel normalization constants, broadcast over (1, 3, H, W).
    channel_mean = np.array([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1)
    channel_std = np.array([0.229, 0.224, 0.224]).reshape(1, 3, 1, 1)
    pixels = (pixels.astype(np.float32) / 255.0 - channel_mean) / channel_std

    # Image features
    image_tensor = torch.FloatTensor(pixels).type(dtype)
    feats_var = cnn(Variable(image_tensor, volatile=True))

    # Encode the referring expression as a (1, length) tensor of token ids.
    vocab = load_vocab(args)
    tokens = tokenize(args.refexp,
                      punct_to_keep=[';', ','],
                      punct_to_remove=['?', '.'])
    token_ids = encode(tokens, vocab['refexp_token_to_idx'], allow_unk=True)
    token_ids = torch.LongTensor(token_ids).view(1, -1)
    token_ids = token_ids.type(dtype).long()
    refexp_var = Variable(token_ids, volatile=True)

    # Run the model
    print('Running the model\n')
    predicted_program = None
    if type(model) is not tuple:
        model.type(dtype)
        scores = model(refexp_var, feats_var)
    else:
        program_generator, execution_engine = model
        program_generator.type(dtype)
        execution_engine.type(dtype)
        use_argmax = (args.sample_argmax == 1)
        predicted_program = program_generator.reinforce_sample(
            refexp_var, temperature=args.temperature, argmax=use_argmax)
        scores = execution_engine(feats_var, predicted_program)

    # Report the highest-scoring answer for the (only) batch element.
    best = scores.data.cpu()[0].max(dim=0)
    predicted_answer_idx = best[1]
    predicted_answer = vocab['answer_idx_to_token'][predicted_answer_idx[0]]

    print('Question: "%s"' % args.refexp)
    print('Predicted answer: ', predicted_answer)

    if predicted_program is None:
        return

    print()
    print('Predicted program:')
    # Track how many inputs are still outstanding; the program is complete
    # once the count reaches zero.
    pending_inputs = 1
    for fn_idx in predicted_program.data.cpu()[0]:
        fn_str = vocab['program_idx_to_token'][fn_idx]
        pending_inputs += iep.programs.get_num_inputs(fn_str) - 1
        print(fn_str)
        if pending_inputs == 0:
            break
コード例 #2
0
def main(args):
    """Preprocess a questions JSON file into a vocab JSON and an HDF5 file.

    Reads `args.input_questions_json`, builds a vocabulary (or loads one from
    `args.input_vocab_json`, optionally expanding it with unseen question
    tokens when `args.expand_vocab == 1`), optionally writes the vocabulary to
    `args.output_vocab_json`, then encodes and pads every question (and
    program, when present) and writes the arrays to `args.output_h5_file`.

    Args:
        args: options object. Reads `input_vocab_json`, `output_vocab_json`,
            `input_questions_json`, `expand_vocab`, `unk_threshold`, `mode`,
            `encode_unk` and `output_h5_file`.

    Returns:
        None. Returns early (after printing a message) when neither an input
        nor an output vocab path is given.
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        print(len(questions))
        # Default to None so the vocab dict below can always be assembled,
        # even for unlabeled data whose questions carry no 'answer'.
        answer_token_to_idx = None
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab(
                (trans_answer(q['answer']) for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab so its new words can be merged
            # into the one loaded from disk.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    # New words are appended at the next free index.
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][trans_answer(
                q['answer'])])

    # Pad encoded questions and programs to a common length with <NULL>
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        # Optional datasets are only written when there is data for them.
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
コード例 #3
0
def _append_rows_to_dataset(h5_file, name, rows):
    """Append `rows` to HDF5 dataset `name`, creating it on first use.

    The dataset is created resizable along axis 0 only; its remaining
    dimensions are taken from `rows` itself so they always match the data.
    """
    rows = np.asarray(rows)
    if name not in h5_file:
        h5_file.create_dataset(name,
                               data=rows,
                               maxshape=(None,) + rows.shape[1:])
        return
    dset = h5_file[name]
    dset.resize(dset.shape[0] + rows.shape[0], axis=0)
    dset[-rows.shape[0]:] = rows


def main(args):
    """Preprocess a refexps JSON file into a vocab JSON and an HDF5 file.

    Reads `args.input_refexps_json`, builds a vocabulary (or loads one from
    `args.input_vocab_json`, optionally expanding it with unseen refexp
    tokens when `args.expand_vocab == 1`), optionally writes the vocabulary to
    `args.output_vocab_json`, encodes and pads the refexps and programs, and
    writes them to `args.output_h5_file` together with one ground-truth mask
    per refexp in the `answers` dataset.

    Args:
        args: options object. Reads `input_vocab_json`, `output_vocab_json`,
            `input_refexps_json`, `input_scenes_json`, `expand_vocab`,
            `unk_threshold`, `mode`, `encode_unk`, `num_examples`, `height`,
            `width` and `output_h5_file`.

    Returns:
        None. Returns early (after printing a message) when neither an input
        nor an output vocab path is given.
    """
    # Masks are flushed to disk in batches of this many to bound memory use.
    mask_batch_size = 100

    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_refexps_json, 'r') as f:
        refexps = json.load(f)['refexps']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in refexps[0]:
            answer_token_to_idx = build_vocab(
                (str(q['answer']) for q in refexps))
        else:
            answer_token_to_idx = None
        refexp_token_to_idx = build_vocab((q['refexp'] for q in refexps),
                                          min_token_count=args.unk_threshold,
                                          punct_to_keep=[';', ','],
                                          punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in refexps:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs, delim=';')
        vocab = {
            'refexp_token_to_idx': refexp_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab so its new words can be merged
            # into the one loaded from disk.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['refexp_token_to_idx']:
                if word not in vocab['refexp_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['refexp_token_to_idx'])
                    vocab['refexp_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    import clevr_ref_util
    ref_util = clevr_ref_util.clevr_ref_util(args.input_scenes_json,
                                             args.input_refexps_json)
    ref_util.load_scene_refexp()

    # Encode all refexps and programs
    print('Encoding data')
    refexps_encoded = []
    programs_encoded = []
    refexp_families = []
    orig_idxs = []
    image_idxs = []
    if args.num_examples != -1:
        refexps = refexps[:args.num_examples]
    for orig_idx, q in enumerate(refexps):
        if orig_idx % 500 == 0:
            print('process refexp program', orig_idx)
        refexp = q['refexp']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'refexp_family_index' in q:
            refexp_families.append(q['refexp_family_index'])
        refexp_tokens = tokenize(refexp,
                                 punct_to_keep=[';', ','],
                                 punct_to_remove=['?', '.'])
        refexp_encoded = encode(refexp_tokens,
                                vocab['refexp_token_to_idx'],
                                allow_unk=args.encode_unk == 1)
        refexps_encoded.append(refexp_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str, delim=';')
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

    # Pad encoded refexps and programs to a common length with <NULL>
    max_refexp_length = max(len(x) for x in refexps_encoded)
    for qe in refexps_encoded:
        while len(qe) < max_refexp_length:
            qe.append(vocab['refexp_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    refexps_encoded = np.asarray(refexps_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(refexps_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('refexps', data=refexps_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        f.create_dataset('programs', data=programs_encoded)
        f.create_dataset('refexp_families', data=np.asarray(refexp_families))

        # Ground-truth masks, written incrementally to the 'answers' dataset.
        # Masks are stored in whatever dtype get_mask_from_refexp returns.
        # NOTE(review): float storage may have been intended -- confirm
        # before changing the on-disk dtype.
        mask_batch = []
        for orig_idx, q in enumerate(refexps):
            if orig_idx % 500 == 0:
                print('process mask gt', orig_idx)
            mask_batch.append(
                ref_util.get_mask_from_refexp(q, args.height, args.width))
            if len(mask_batch) >= mask_batch_size:
                _append_rows_to_dataset(f, 'answers', mask_batch)
                mask_batch = []

        # Flush the final partial batch; this also covers inputs with fewer
        # than one full batch of refexps.
        if mask_batch:
            _append_rows_to_dataset(f, 'answers', mask_batch)
コード例 #4
0
ファイル: run_model.py プロジェクト: sxtyzhangzk/clevr-iep
def run_single_example(args, model, cnn_in=None):
  """Run `model` on one image/question pair and export a gradient focus map.

  The image at `args.image` is resized, saved to "resized.png", normalized and
  passed through the feature CNN; `args.question` is tokenized and encoded
  with the loaded vocab. After the forward pass, `scores.sum()` is
  back-propagated and the gradient reaching a hooked tensor (`ffm` for tuple
  models, the CNN features otherwise) is summed over channels, normalized to
  [0, 1] and interpolated from the 14x14 grid to a 240x320 map. The map is
  written as text to `args.focus_data` and as an image to `args.focus_img`
  when those are not None. Finally the predicted answer (and program, for
  tuple models) is printed.

  Args:
    args: options object. Reads `use_gpu`, `image`, `image_height`,
      `image_width`, `question`, `focus_data`, `focus_img` and, for tuple
      models, `temperature` and `sample_argmax`; it is also forwarded to
      `build_cnn` and `load_vocab`.
    model: either a `(program_generator, execution_engine)` tuple or a single
      module invoked as `model(question_var, feats_var)`.
    cnn_in: optional prebuilt feature CNN; when None one is built with
      `build_cnn`.
  """
  dtype = torch.FloatTensor
  if args.use_gpu == 1:
    dtype = torch.cuda.FloatTensor

  # Build the CNN to use for feature extraction unless one was supplied
  if cnn_in is None:
    print('Loading CNN for feature extraction')
    cnn = build_cnn(args, dtype)
  else:
    cnn = cnn_in

  # Load and preprocess the image
  img_size = (args.image_height, args.image_width)
  img = imread(args.image, mode='RGB')
  img = imresize(img, img_size, interp='bicubic')
  imsave("resized.png", img)
  img = img.transpose(2, 0, 1)[None]
  mean = np.array([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1)
  std = np.array([0.229, 0.224, 0.224]).reshape(1, 3, 1, 1)
  img = (img.astype(np.float32) / 255.0 - mean) / std

  # Use CNN to extract features for the image. Gradients are required here
  # because the focus map is derived from a backward pass.
  img_var = Variable(torch.FloatTensor(img).type(dtype), volatile=False, requires_grad=True)
  feats_var = cnn(img_var)

  # Tokenize the question
  vocab = load_vocab(args)
  question_tokens = tokenize(args.question,
                      punct_to_keep=[';', ','],
                      punct_to_remove=['?', '.'])
  question_encoded = encode(question_tokens,
                       vocab['question_token_to_idx'],
                       allow_unk=True)
  question_encoded = torch.LongTensor(question_encoded).view(1, -1)
  question_encoded = question_encoded.type(dtype).long()
  question_var = Variable(question_encoded, volatile=False)

  # Run the model
  print('Running the model\n')
  scores = None
  predicted_program = None

  # Spatial size of the hooked gradient map and of the exported focus map.
  GMAP_W = 14
  GMAP_H = 14

  IMG_W = 320
  IMG_H = 240

  gm_ffm = np.zeros([GMAP_H, GMAP_W])

  def hook(gmap, layers, grad):
    """Accumulate the channel-summed gradient into `gmap` and normalize it."""
    data = grad.data.cpu().numpy()
    # Sum the first `layers` channels of batch element 0 at each location.
    gmap += data[0, :layers, :GMAP_H, :GMAP_W].sum(axis=0, dtype=np.float64)
    peak = np.abs(gmap).max()
    # An all-zero gradient would otherwise turn the whole map into NaN.
    if peak > 0:
      gmap[:] = np.abs(gmap / peak)

  if isinstance(model, tuple):
    program_generator, execution_engine = model
    program_generator.type(dtype)
    execution_engine.type(dtype)
    predicted_program = program_generator.reinforce_sample(
                          question_var,
                          temperature=args.temperature,
                          argmax=(args.sample_argmax == 1))
    scores, ffm = execution_engine(feats_var, predicted_program)
    ffm.register_hook(lambda grad: hook(gm_ffm, 128, grad))
  else:
    model.type(dtype)
    scores = model(question_var, feats_var)
    feats_var.register_hook(lambda grad: hook(gm_ffm, 1024, grad))

  print("SCORES=", scores[0][0])

  # Back-propagate the summed scores; this fires the hook and fills gm_ffm.
  total_score = scores.sum()
  total_score.backward()

  # Interpolate the coarse map to image resolution. Both grids are expressed
  # as cell centres in normalized [0, 1] coordinates.
  row_knots = [(i + 0.5) / GMAP_H for i in range(GMAP_H)]
  col_knots = [(j + 0.5) / GMAP_W for j in range(GMAP_W)]
  spline = ip.RectBivariateSpline(row_knots, col_knots, gm_ffm.copy())
  row_samples = [(i + 0.5) / IMG_H for i in range(IMG_H)]
  col_samples = [(j + 0.5) / IMG_W for j in range(IMG_W)]
  # Evaluating on the full grid at once yields an (IMG_H, IMG_W) array.
  z_new = spline(row_samples, col_samples)

  if args.focus_data is not None:
    with open(args.focus_data, 'w') as out_file:
      for row in z_new:
        # One line per image row; every value is followed by a space.
        out_file.write(''.join(str(d) + ' ' for d in row))
        out_file.write('\n')

  # Grayscale rendering of the focus map, replicated over three channels.
  fimg = np.repeat((z_new * 255)[:, :, np.newaxis], 3, axis=2)
  if args.focus_img is not None:
    imsave(args.focus_img, fimg)

  # Print results
  _, predicted_answer_idx = scores.data.cpu()[0].max(dim=0)
  predicted_answer = vocab['answer_idx_to_token'][predicted_answer_idx[0]]

  print('Question: "%s"' % args.question)
  print('Predicted answer: ', predicted_answer)

  if predicted_program is not None:
    print()
    print('Predicted program:')
    program = predicted_program.data.cpu()[0]
    # Track how many inputs are still outstanding; the program is complete
    # once the count reaches zero.
    num_inputs = 1
    for fn_idx in program:
      fn_str = vocab['program_idx_to_token'][fn_idx]
      num_inputs += iep.programs.get_num_inputs(fn_str) - 1
      print(fn_str)
      if num_inputs == 0:
        break
コード例 #5
0
def main(args):
  """Preprocess multi-view questions/scenes into a vocab and an HDF5 file.

  Questions and scenes are read either from numbered sub-directories (when
  `args.multi_dir` is set; the sub-directory range is chosen from whether
  "train" or "val" appears in `args.output_h5_file`) or from the single files
  `args.input_questions_json` / `args.input_scenes_json`. A vocabulary is
  built (or loaded and optionally expanded), written out as JSON plus a
  plain-text list of OCR tokens, and every question is encoded once per view
  (`args.num_views`) and written to `args.output_h5_file`.

  Args:
    args: options object. Reads `input_vocab_json`, `output_vocab_json`,
      `output_h5_file`, `multi_dir`, `input_questions_json`,
      `input_scenes_json`, `binary_qs_only`, `expand_vocab`, `unk_threshold`,
      `mode`, `num_views` and `encode_unk`.

  Returns:
    None. Returns early (after printing a message) when neither an input nor
    an output vocab path is given.
  """
  if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
    print('Must give one of --input_vocab_json or --output_vocab_json')
    return
  if "train" in args.output_h5_file and args.multi_dir:
    subdirs = list(range(25))
  elif "val" in args.output_h5_file and args.multi_dir:
    subdirs = [25, 26]
  elif args.multi_dir:
    subdirs = [27, 28, 29]
  else:
    subdirs = []

  questions = []
  scenes = []
  for subdir in subdirs:
    question_path = os.path.join(args.input_questions_json, str(subdir), "questions.json")
    scene_path = os.path.join(args.input_scenes_json, str(subdir), "scenes.json")
    with open(scene_path, "r") as scene_file:
      ss = json.load(scene_file)['scenes']
    for s in ss:
      # Tag the scene's 'cc' entry with the sub-directory it came from.
      s['cc']['subdir'] = subdir
    scenes.extend(ss)
    with open(question_path, "r") as question_file:
      qs = json.load(question_file)['questions']
    for q in qs:
      q['subdir'] = subdir
    questions.extend(qs)
  # Fall back to the single-file layout when no sub-directory data was read.
  if not questions:
    with open(args.input_questions_json, "r") as question_file:
      questions = json.load(question_file)['questions']
  if not scenes:
    with open(args.input_scenes_json, "r") as scene_file:
      scenes = json.load(scene_file)['scenes']
  if args.binary_qs_only:
    filtered_questions = []
    for q in tqdm(questions):
      if q['answer'] in [True, False] and q['question'] != "?":
        filtered_questions.append(q)
    questions = filtered_questions
  # Either create the vocab or load it from disk
  if args.input_vocab_json == '' or args.expand_vocab == 1:
    print('Building vocab')
    # Default to None so the vocab dict below can always be assembled, even
    # when the questions carry no 'answer'.
    answer_token_to_idx = None
    if 'answer' in questions[0]:
      answer_token_to_idx = build_vocab(
        (str(q['answer']) for q in questions),
        answers_only=True)
    question_token_to_idx = build_vocab(
      (q['question'] for q in questions),
      min_token_count=args.unk_threshold,
      punct_to_keep=[';', ','], punct_to_remove=['?', '.']
    )
    all_program_strs = []
    for q in questions:
      if 'program' not in q:
        continue
      program_str = program_to_str(q['program'], args.mode)
      if program_str is not None:
        all_program_strs.append(program_str)
    program_token_to_idx = build_vocab(all_program_strs)

    # Collect the text body of every object in every view of every scene.
    all_scene_text = []
    for scene in scenes:
      for view_name, view_struct in scene.items():
        for obj in view_struct['objects']:
          all_scene_text.append(obj['text']['body'])
    ocr_to_idx = build_vocab(all_scene_text)

    vocab = {
      'ocr_to_idx': ocr_to_idx,
      'question_token_to_idx': question_token_to_idx,
      'program_token_to_idx': program_token_to_idx,
      'answer_token_to_idx': answer_token_to_idx,
    }
  if args.input_vocab_json != '':
    print('Loading vocab')
    if args.expand_vocab == 1:
      # Keep the freshly built vocab so its new words can be merged into the
      # one loaded from disk.
      new_vocab = vocab
    with open(args.input_vocab_json, 'r') as f:
      vocab = json.load(f)
    if args.expand_vocab == 1:
      num_new_words = 0
      for word in new_vocab['question_token_to_idx']:
        if word not in vocab['question_token_to_idx']:
          print('Found new word %s' % word)
          idx = len(vocab['question_token_to_idx'])
          vocab['question_token_to_idx'][word] = idx
          num_new_words += 1
      print('Found %d new words' % num_new_words)

  if args.output_vocab_json != '':
    # Plain-text OCR vocabulary next to the JSON file: same path with the
    # final extension replaced by ".txt". splitext only strips the last
    # extension, so dots elsewhere in the path (e.g. "./data/") are kept.
    vocab_out_path = os.path.splitext(args.output_vocab_json)[0] + ".txt"
    with open(vocab_out_path, "w") as out_file:
      for word in vocab['ocr_to_idx']:
        out_file.write(word + "\n")

    with open(args.output_vocab_json, 'w') as f:
      json.dump(vocab, f)

  # Encode all questions and programs
  print('Encoding data')
  questions_encoded = []
  programs_encoded = []
  question_families = []
  orig_idxs = []
  image_idxs = []
  answers = []
  baseline = questions[0]['image_index']
  for orig_idx, q in enumerate(questions):
    # Questions read from a sub-directory are indexed relative to the first
    # question's image index. Test for None explicitly: sub-directory 0 is a
    # valid value and must not be mistaken for "no sub-directory".
    if q.get("subdir") is not None:
      offset = q['image_index'] - baseline
    else:
      offset = q['image_index']

    # The encodings do not depend on the view, so compute them once per
    # question and store an independent copy for every view.
    question_tokens = tokenize(q['question'],
                        punct_to_keep=[';', ','],
                        punct_to_remove=['?', '.'])
    question_encoded = encode(question_tokens,
                         vocab['question_token_to_idx'],
                         allow_unk=args.encode_unk == 1)
    program_encoded = None
    if 'program' in q:
      program_str = program_to_str(q['program'], args.mode)
      program_tokens = tokenize(program_str)
      program_encoded = encode(program_tokens, vocab['program_token_to_idx'])

    # The same question is asked about each view of the scene.
    for view in range(args.num_views):
      orig_idxs.append(orig_idx)
      image_idxs.append(offset + view)
      if 'question_family_index' in q:
        question_families.append(q['question_family_index'])
      questions_encoded.append(list(question_encoded))

      if program_encoded is not None:
        programs_encoded.append(list(program_encoded))

      if 'answer' in q:
        # NOTE(review): a failed lookup is only printed, so `answers` can end
        # up shorter than `questions_encoded` -- confirm this is intended.
        try:
          answers.append(vocab['answer_token_to_idx'][str(q['answer'])])
        except Exception as e:
          print(e)
  # Pad encoded questions and programs to a common length with <NULL>
  max_question_length = max(len(x) for x in questions_encoded)
  for qe in questions_encoded:
    while len(qe) < max_question_length:
      qe.append(vocab['question_token_to_idx']['<NULL>'])

  if len(programs_encoded) > 0:
    max_program_length = max(len(x) for x in programs_encoded)
    for pe in programs_encoded:
      while len(pe) < max_program_length:
        pe.append(vocab['program_token_to_idx']['<NULL>'])

  # Create h5 file
  print('Writing output')
  questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
  programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
  print(questions_encoded.shape)
  print(programs_encoded.shape)
  with h5py.File(args.output_h5_file, 'w') as f:
    f.create_dataset('questions', data=questions_encoded)
    f.create_dataset('image_idxs', data=np.asarray(image_idxs))
    f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

    # Optional datasets are only written when there is data for them.
    if len(programs_encoded) > 0:
      f.create_dataset('programs', data=programs_encoded)
    if len(question_families) > 0:
      f.create_dataset('question_families', data=np.asarray(question_families))
    if len(answers) > 0:
      f.create_dataset('answers', data=np.asarray(answers))
コード例 #6
0
def main(args):
    """Preprocess an id-keyed questions JSON file into a vocab and HDF5 file.

    The input JSON maps question ids to records with `question`, `answer` and
    `imageId` fields. Ids listed (one per line) in
    `args.input_filter_questions_json` are excluded. Image ids are remapped
    to consecutive indices starting from zero in sorted order. A vocabulary
    is built (or loaded and optionally expanded with new question words and
    answers), optionally written to `args.output_vocab_json`, and the encoded,
    padded questions are written to `args.output_h5_file`.

    Args:
        args: options object. Reads `input_vocab_json`, `output_vocab_json`,
            `input_questions_json`, `input_filter_questions_json`,
            `expand_vocab`, `unk_threshold`, `mode`, `encode_unk` and
            `output_h5_file`.

    Returns:
        None. Returns early (after printing a message) when neither an input
        nor an output vocab path is given.
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        data = json.load(f)

    # Ids to exclude. A truthiness test treats every "not provided" value
    # ([], '' or None) the same way instead of trying to open it.
    excluded_ids = set()
    if args.input_filter_questions_json:
        with open(args.input_filter_questions_json, 'r') as fq:
            excluded_ids = set(fq.read().splitlines())

    # questions keys: answer, question, index, image_index
    imgs_idxs = set()
    questions = []
    for idx, question in data.items():
        if idx in excluded_ids:
            continue
        img_idx = question['imageId']
        imgs_idxs.add(img_idx)
        questions.append({
            'question': question['question'],
            'answer': question['answer'],
            'index': int(idx),
            'image_index': img_idx,
        })

    # Remap image ids to consecutive indices starting from zero.
    imgs_idxs = sorted(imgs_idxs)
    mapper = {x: i for i, x in enumerate(imgs_idxs)}
    for q in questions:
        q['image_index'] = mapper[q['image_index']]

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            # Added empty delim to keep all the answer as a token.
            answer_token_to_idx = build_vocab((q['answer'] for q in questions),
                                              delim='')
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab so its new entries can be merged
            # into the one loaded from disk.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            num_new_answers = 0

            # Apparently, train and val in miniGQA have different
            # answers.
            for word in new_vocab['answer_token_to_idx']:
                if word not in vocab['answer_token_to_idx']:
                    print('Found new answer %s' % word)
                    idx = len(vocab['answer_token_to_idx'])
                    vocab['answer_token_to_idx'][word] = idx
                    num_new_answers += 1

            print('Found %d new words' % num_new_words)
            print('Found %d new answers' % num_new_answers)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs to a common length with <NULL>
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        # Optional datasets are only written when there is data for them.
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
コード例 #7
0
def run_single_example(args, model, image_filepath = None, counting = True):
  """
  Run the model on one (image, question) pair and return prediction confidences.

  The image is resized, normalized and passed through the feature-extraction
  CNN; the question in ``args.question`` is tokenized and encoded with the
  vocabulary returned by ``load_vocab(args)``; the model is then run on both
  and the softmax over its answer scores is computed.

  Args:
    args: Options object. This function reads ``use_gpu``, ``image``,
      ``image_height``, ``image_width``, ``question``, ``temperature`` and
      ``sample_argmax`` (``build_cnn`` / ``load_vocab`` may read more).
    model: Either a ``(program_generator, execution_engine)`` tuple or a
      single module called as ``model(question_var, feats_var)``.
    image_filepath: Optional image path; when None, ``args.image`` is used.
    counting: Selects the return value (see below).

  Returns:
    If ``counting`` is true, a list with the softmax probability at each index
    in the module-level ``CONSTRAINED_INDECES``. Otherwise the result of
    ``probability.max(dim=0)``, i.e. the (max probability, answer index) pair.
  """
  global CONSTRAINED_INDECES
  dtype = torch.FloatTensor
  if args.use_gpu == 1:
    dtype = torch.cuda.FloatTensor

  # Build the CNN to use for feature extraction
  print('Loading CNN for feature extraction')
  cnn = build_cnn(args, dtype)

  # Load and preprocess the image
  if image_filepath is None:
    image_path = args.image
  else:
    print ("Found image")
    image_path = image_filepath
  img = Image.open(image_path).convert('RGB')
  # PIL's resize() takes (width, height). Passing (height, width), as the
  # scipy imresize call this replaced did, transposes non-square target sizes.
  img = img.resize((args.image_width, args.image_height), resample=Image.BICUBIC)
  img = np.array(img)
  img = img.transpose(2, 0, 1)[None]  # HWC -> 1 x C x H x W
  mean = np.array([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1)
  # NOTE(review): the last std value is 0.224 here, not the usual ImageNet
  # 0.225. Left unchanged because any trained weights presumably saw this
  # exact normalization -- confirm before touching.
  std = np.array([0.229, 0.224, 0.224]).reshape(1, 3, 1, 1)
  img = (img.astype(np.float32) / 255.0 - mean) / std

  # Tokenize the question
  vocab = load_vocab(args)
  question_tokens = tokenize(args.question,
                      punct_to_keep=[';', ','],
                      punct_to_remove=['?', '.'])
  question_encoded = encode(question_tokens,
                       vocab['question_token_to_idx'],
                       allow_unk=True)
  question_encoded = torch.LongTensor(question_encoded).view(1, -1)
  # Round-trip through dtype so the indices land on the same device as the model.
  question_encoded = question_encoded.type(dtype).long()
  question_var = Variable(question_encoded)
  if CONSTRAINED_INDECES is None:
    set_constrained_indeces(vocab)

  scores = None
  predicted_program = None
  # This is inference only: keep every forward pass inside no_grad so no
  # autograd graph is built (the role volatile=True used to play).
  with torch.no_grad():
    # Use CNN to extract features for the image
    img_var = Variable(torch.FloatTensor(img).type(dtype))
    feats_var = cnn(img_var)

    # Run the model
    print('Running the model\n')
    if isinstance(model, tuple):
      program_generator, execution_engine = model
      program_generator.type(dtype)
      execution_engine.type(dtype)
      predicted_program = program_generator.reinforce_sample(
                            question_var,
                            temperature=args.temperature,
                            argmax=(args.sample_argmax == 1))
      scores = execution_engine(feats_var, predicted_program)
    else:
      model.type(dtype)
      scores = model(question_var, feats_var)

    # scores is indexed as (batch, answers) below, so normalize over dim 1.
    probability = F.softmax(scores, dim=1).data.cpu()[0]

  # Print results
  _, predicted_answer_idx = scores.data.cpu()[0].max(dim=0)
  print (predicted_answer_idx)
  predicted_answer = vocab['answer_idx_to_token'][predicted_answer_idx.item()]

  print('Question: "%s"' % args.question)
  print('Predicted answer: ', predicted_answer)
  print('Predicted answer confidence: ', probability.max(dim=0))
  if predicted_program is not None:
    print()
    print('Predicted program:')
    program = predicted_program.data.cpu()[0]
    # Count how many inputs are still unfilled; the program is complete once
    # this reaches zero, and any tokens after that point are not printed.
    num_inputs = 1
    for fn_idx in program:
      fn_str = vocab['program_idx_to_token'][fn_idx.item()]
      num_inputs += iep.programs.get_num_inputs(fn_str) - 1
      print(fn_str)
      if num_inputs == 0:
        break
  # return the probabilities for all choices when counting
  if counting:
    return [probability[i] for i in CONSTRAINED_INDECES]
  return probability.max(dim=0)
コード例 #8
0
def compute_saliency_map(args, model, image_filepath = None, counting = True, smoothgrad = True, output_dir = "../output/"):
  """
  Compute the SmoothGrad saliency map.

  Note the saliency map is computed with respect to the classification.
  If the query is "how many spheres?", and the scene contains 1 sphere,
  the saliency map should highlight the object which most contributes
  to this classification of 1 sphere. The saliency map would be different
  for a different query (e.g., "how many cubes?")

  The map is drawn next to the input image and an overlay of the two, saved
  as ``out.png`` inside ``output_dir`` and shown with ``plt.show()``. The
  model is then run once more to print and return its answer confidences.

  Args:
    args: Options object. This function reads ``use_gpu``, ``image`` and
      ``question`` (``build_cnn`` / ``load_vocab`` may read more).
    model: Model object forwarded to ``get_smoothed_mask`` and ``run_model``.
    image_filepath: Optional image path; when None, ``args.image`` is used.
    counting: Selects the return value (see below).
    smoothgrad: Accepted for interface compatibility; not read by this
      function, which always calls ``get_smoothed_mask``.
    output_dir: Directory for ``out.png``; created if it does not exist.

  Returns:
    If ``counting`` is true, a list with the softmax probability at each index
    in the module-level ``CONSTRAINED_INDECES``. Otherwise the result of
    ``probability.max(dim=0)``, i.e. the (max probability, answer index) pair.
  """
  global CONSTRAINED_INDECES
  # Make sure the output directory (and any missing parents) exists.
  os.makedirs(output_dir, exist_ok=True)

  use_gpu = args.use_gpu == 1
  dtype = torch.cuda.FloatTensor if use_gpu else torch.FloatTensor
  # Keep the device string in step with dtype instead of always using 'cuda'.
  # NOTE(review): assumes get_smoothed_mask accepts 'cpu' as well -- confirm.
  device = 'cuda' if use_gpu else 'cpu'

  # Tokenize the question
  vocab = load_vocab(args)
  question_tokens = tokenize(args.question,
                      punct_to_keep=[';', ','],
                      punct_to_remove=['?', '.'])
  question_encoded = encode(question_tokens,
                       vocab['question_token_to_idx'],
                       allow_unk=True)
  question_encoded = torch.LongTensor(question_encoded).view(1, -1)
  # Round-trip through dtype so the indices land on the same device as the model.
  question_encoded = question_encoded.type(dtype).long()
  question_var = Variable(question_encoded)
  if CONSTRAINED_INDECES is None:
    set_constrained_indeces(vocab)

  # Build the CNN to use for feature extraction
  print('Loading CNN for feature extraction')
  cnn = build_cnn(args, dtype)
  cnn.eval()

  # Load and preprocess the image
  if image_filepath is None:
    image_path = args.image
  else:
    print ("Found image")
    # Honor the explicitly supplied path rather than falling back to args.image.
    image_path = image_filepath
  img = Image.open(image_path).convert('RGB')

  img_var = preprocess(img)
  img_var = img_var.type(dtype)

  saliency = get_smoothed_mask(model, cnn, question_var, img_var, dtype, device)[0].cpu().detach().numpy()
  img = deprocess_img(img_var)

  # Three panels: input image, saliency map, saliency overlaid on the image.
  fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
  ax1.imshow(np.asarray(img))
  ax1.axis('off')

  ax2.imshow(saliency, cmap=plt.cm.gist_heat)
  ax2.axis('off')

  ax3.imshow(np.asarray(img), alpha = 0.5)
  ax3.imshow(saliency, cmap=plt.cm.gist_heat, alpha = 0.7)
  ax3.axis('off')

  # os.path.join works whether or not output_dir ends with a separator.
  plt.savefig(os.path.join(output_dir, 'out.png'), bbox_inches='tight', pad_inches=0)

  plt.show()
  # Release the figure so repeated calls do not accumulate open figures.
  plt.close(fig)

  scores, predicted_program = run_model(model, cnn, question_var, img_var, dtype)
  # scores is indexed as (batch, answers) below, so normalize over dim 1.
  probability = F.softmax(scores, dim=1).data.cpu()[0]
  # Print results
  _, predicted_answer_idx = scores.data.cpu()[0].max(dim=0)
  print (predicted_answer_idx)
  predicted_answer = vocab['answer_idx_to_token'][predicted_answer_idx.item()]

  print ("Predicted answer list: " + str(vocab['answer_idx_to_token']))

  print('Question: "%s"' % args.question)
  print('Predicted answer: ', predicted_answer)

  # NOTE(review): indices 4 and 5 are presumably the vocabulary slots of the
  # answers "0" and "1"; they are hard-coded, so verify against the vocab file.
  print('Confidence - 0: ', probability[4])
  print('Confidence - 1: ', probability[5])

  print('Predicted answer confidence: ', probability.max(dim=0))

  if predicted_program is not None:
    print()
    print('Predicted program:')
    program = predicted_program.data.cpu()[0]
    # Count how many inputs are still unfilled; the program is complete once
    # this reaches zero, and any tokens after that point are not printed.
    num_inputs = 1
    for fn_idx in program:
      fn_str = vocab['program_idx_to_token'][fn_idx.item()]
      num_inputs += iep.programs.get_num_inputs(fn_str) - 1
      print(fn_str)
      if num_inputs == 0:
        break
  # return the probabilities for all choices when counting
  if counting:
    return [probability[i] for i in CONSTRAINED_INDECES]
  return probability.max(dim=0)