import json
import os

import h5py
import numpy as np
from tqdm import tqdm

# The helpers used below (tokenize, encode, build_vocab, program_to_str,
# trans_answer) are assumed to come from the project's shared preprocessing
# utilities; they are not defined in this file.


def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_refexps_json, 'r') as f:
        refexps = json.load(f)['refexps']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in refexps[0]:
            answer_token_to_idx = build_vocab(
                (str(q['answer']) for q in refexps))
        else:
            answer_token_to_idx = None
        refexp_token_to_idx = build_vocab(
            (q['refexp'] for q in refexps),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in refexps:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs, delim=';')
        vocab = {
            'refexp_token_to_idx': refexp_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['refexp_token_to_idx']:
                if word not in vocab['refexp_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['refexp_token_to_idx'])
                    vocab['refexp_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    import clevr_ref_util
    # Bind the instance to a new name so it does not shadow the module.
    ref_util = clevr_ref_util.clevr_ref_util(
        args.input_scenes_json, args.input_refexps_json)
    ref_util.load_scene_refexp()

    # Encode all refexps and programs
    print('Encoding data')
    refexps_encoded = []
    programs_encoded = []
    refexp_families = []
    orig_idxs = []
    image_idxs = []
    if args.num_examples != -1:
        refexps = refexps[:args.num_examples]
    for orig_idx, q in enumerate(refexps):
        if orig_idx % 500 == 0:
            print('process refexp program', orig_idx)
        refexp = q['refexp']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'refexp_family_index' in q:
            refexp_families.append(q['refexp_family_index'])
        refexp_tokens = tokenize(refexp,
                                 punct_to_keep=[';', ','],
                                 punct_to_remove=['?', '.'])
        refexp_encoded = encode(refexp_tokens,
                                vocab['refexp_token_to_idx'],
                                allow_unk=args.encode_unk == 1)
        refexps_encoded.append(refexp_encoded)
        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str, delim=';')
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

    # Pad encoded refexps and programs to a common length
    max_refexp_length = max(len(x) for x in refexps_encoded)
    for qe in refexps_encoded:
        while len(qe) < max_refexp_length:
            qe.append(vocab['refexp_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    refexps_encoded = np.asarray(refexps_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(refexps_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('refexps', data=refexps_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        f.create_dataset('programs', data=programs_encoded)
        f.create_dataset('refexp_families', data=np.asarray(refexp_families))

        # Add the ground-truth masks, appending to a resizable dataset in
        # chunks of 100 so all masks never sit in memory at once.
        tmp_ans = []
        should_create = True
        for orig_idx, q in enumerate(refexps):
            if orig_idx % 500 == 0:
                print('process mask gt', orig_idx)
            cur_mask = ref_util.get_mask_from_refexp(q, args.height, args.width)
            cur_mask = cur_mask.astype(float)  # astype returns a copy; keep it
            tmp_ans.append(cur_mask)
            if len(tmp_ans) >= 100:
                tmp_ans = np.asarray(tmp_ans)
                if should_create:
                    # Derive maxshape from the data so the per-mask axes always
                    # match; only the sample axis is left unbounded to grow.
                    f.create_dataset('answers', data=tmp_ans,
                                     maxshape=(None,) + tmp_ans.shape[1:])
                    should_create = False
                else:
                    f['answers'].resize(
                        f['answers'].shape[0] + tmp_ans.shape[0], axis=0)
                    f['answers'][-tmp_ans.shape[0]:] = tmp_ans
                tmp_ans = []
        # Flush any remaining masks. Creating the dataset here (instead of
        # asserting) also handles inputs with fewer than 100 refexps.
        if len(tmp_ans) != 0:
            tmp_ans = np.asarray(tmp_ans)
            if should_create:
                f.create_dataset('answers', data=tmp_ans,
                                 maxshape=(None,) + tmp_ans.shape[1:])
                should_create = False
            else:
                f['answers'].resize(
                    f['answers'].shape[0] + tmp_ans.shape[0], axis=0)
                f['answers'][-tmp_ans.shape[0]:] = tmp_ans
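
# A minimal read-back sketch for the file written above. The dataset names
# match what main() creates, but this helper is an illustrative addition,
# not part of the original pipeline.
def inspect_refexp_h5(path):
    with h5py.File(path, 'r') as f:
        refexps = f['refexps'][:]    # (num_refexps, max_refexp_length) int32
        programs = f['programs'][:]  # (num_refexps, max_program_length) int32
        masks = f['answers'][:]      # (num_refexps, H, W) ground-truth masks
        print(refexps.shape, programs.shape, masks.shape)
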
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    # With --multi_dir the questions/scenes are sharded across numbered
    # subdirectories: 0-24 for train, 25-26 for val, 27-29 for test.
    if 'train' in args.output_h5_file and args.multi_dir:
        subdirs = [x for x in range(25)]
    elif 'val' in args.output_h5_file and args.multi_dir:
        subdirs = [25, 26]
    elif args.multi_dir:
        subdirs = [27, 28, 29]
    else:
        subdirs = []

    questions = []
    scenes = []
    for subdir in subdirs:
        question_path = os.path.join(args.input_questions_json, str(subdir),
                                     'questions.json')
        scene_path = os.path.join(args.input_scenes_json, str(subdir),
                                  'scenes.json')
        ss = json.load(open(scene_path, 'r'))['scenes']
        for s in ss:
            s['cc']['subdir'] = subdir
        scenes.extend(ss)
        qs = json.load(open(question_path, 'r'))['questions']
        for q in qs:
            q['subdir'] = subdir
        questions.extend(qs)
    # Without --multi_dir, fall back to single monolithic json files.
    if not questions:
        questions = json.load(open(args.input_questions_json, 'r'))['questions']
    if not scenes:
        scenes = json.load(open(args.input_scenes_json, 'r'))['scenes']

    if args.binary_qs_only:
        filtered_questions = []
        for q in tqdm(questions):
            if q['answer'] in [True, False] and q['question'] != '?':
                filtered_questions.append(q)
        questions = filtered_questions

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab(
                (str(q['answer']) for q in questions), answers_only=True)
        else:
            answer_token_to_idx = None  # the vocab dict below needs the key
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        # Collect the OCR text bodies from every object in every view.
        all_scene_text = []
        for scene in scenes:
            for view_name, view_struct in scene.items():
                for obj in view_struct['objects']:
                    all_scene_text.append(obj['text']['body'])
        ocr_to_idx = build_vocab(all_scene_text)
        vocab = {
            'ocr_to_idx': ocr_to_idx,
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    # Also dump the OCR vocab as a plain text file next to the json.
    vocab_out_path = args.output_vocab_json.split('.')[0] + '.txt'
    if vocab_out_path != '.txt':  # '!=', not 'is not': identity tests on str literals are unreliable
        with open(vocab_out_path, 'w') as out_file:
            for word in vocab['ocr_to_idx'].keys():
                out_file.write(word + '\n')

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    baseline = questions[0]['image_index']
    for orig_idx, q in enumerate(questions):
        question = q['question']
        # The same question is asked about each of the args.num_views views of
        # its scene, so the per-question bookkeeping is repeated per view to
        # keep all output arrays parallel.
        if q.get('subdir') is not None:  # 'is not None' so subdir 0 is not skipped
            offset = q['image_index'] - baseline
            # (an alternative offset computation based on counting images per
            # subdir was abandoned here)
        else:
            offset = q['image_index']
        for view in range(args.num_views):
            orig_idxs.append(orig_idx)
            image_idxs.append(offset + view)
            if 'question_family_index' in q:
                question_families.append(q['question_family_index'])
            question_tokens = tokenize(question,
                                       punct_to_keep=[';', ','],
                                       punct_to_remove=['?', '.'])
            question_encoded = encode(question_tokens,
                                      vocab['question_token_to_idx'],
                                      allow_unk=args.encode_unk == 1)
            questions_encoded.append(question_encoded)
            if 'program' in q:
                program = q['program']
                program_str = program_to_str(program, args.mode)
                program_tokens = tokenize(program_str)
                program_encoded = encode(program_tokens,
                                         vocab['program_token_to_idx'])
                programs_encoded.append(program_encoded)
            if 'answer' in q:
                try:
                    answers.append(
                        vocab['answer_token_to_idx'][str(q['answer'])])
                except KeyError as e:  # unseen answer; log and skip
                    print(e)

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
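
# Hypothetical driver for the multi-view variant above. The flag names match
# the args attributes that main() reads, but every type and default below is
# an assumption for illustration, not taken from the original repository.
import argparse

def build_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_questions_json', required=True)
    parser.add_argument('--input_scenes_json', required=True)
    parser.add_argument('--input_vocab_json', default='')
    parser.add_argument('--output_vocab_json', default='')
    parser.add_argument('--output_h5_file', required=True)
    parser.add_argument('--expand_vocab', type=int, default=0)
    parser.add_argument('--unk_threshold', type=int, default=1)
    parser.add_argument('--encode_unk', type=int, default=0)
    parser.add_argument('--mode', default='prefix')
    parser.add_argument('--multi_dir', action='store_true')
    parser.add_argument('--binary_qs_only', action='store_true')
    parser.add_argument('--num_views', type=int, default=20)
    return parser
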
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        print(len(questions))
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab(
                (trans_answer(q['answer']) for q in questions))
        else:
            answer_token_to_idx = None  # the vocab dict below needs the key
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
        if 'answer' in q:
            answers.append(
                vocab['answer_token_to_idx'][trans_answer(q['answer'])])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
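
# All four variants above rely on the same helper trio, which this file never
# defines. The sketch below only approximates their behavior from the call
# sites (standard CLEVR-style preprocessing); it is an assumption, not the
# original implementation, and uses *_sketch names so it cannot shadow the
# real imports.
SPECIAL_TOKENS = {'<NULL>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3}

def tokenize_sketch(s, delim=' ', punct_to_keep=None, punct_to_remove=None):
    # Pad kept punctuation with the delimiter, strip removed punctuation,
    # then split; an empty delim keeps the whole string as one token (as the
    # answer vocab in the GQA variant appears to rely on).
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))
    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')
    tokens = s.split(delim) if delim else [s]
    return ['<START>'] + tokens + ['<END>']

def build_vocab_sketch(sequences, min_token_count=1, delim=' ',
                       punct_to_keep=None, punct_to_remove=None):
    # Count tokens across all sequences, then assign indices after the
    # special tokens to every token meeting the frequency threshold.
    counts = {}
    for seq in sequences:
        for token in tokenize_sketch(seq, delim=delim,
                                     punct_to_keep=punct_to_keep,
                                     punct_to_remove=punct_to_remove)[1:-1]:
            counts[token] = counts.get(token, 0) + 1
    token_to_idx = dict(SPECIAL_TOKENS)
    for token, count in sorted(counts.items()):
        if count >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    return token_to_idx

def encode_sketch(tokens, token_to_idx, allow_unk=False):
    # Map tokens to indices, optionally falling back to <UNK>.
    idxs = []
    for token in tokens:
        if token not in token_to_idx:
            if not allow_unk:
                raise KeyError('Token "%s" not in vocab' % token)
            token = '<UNK>'
        idxs.append(token_to_idx[token])
    return idxs
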
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        # question keys: answer, question, program, index,
        # image_index (remapped below so indices start from zero)
        data = json.load(f)

    filter_questions = []
    if args.input_filter_questions_json:
        with open(args.input_filter_questions_json, 'r') as fq:
            filter_questions = fq.read().splitlines()

    imgs_idxs = set()
    questions = []
    for idx, question in data.items():
        # The filter file, when given, lists question ids to exclude.
        if filter_questions and idx in filter_questions:
            continue
        img_idx = question['imageId']
        imgs_idxs.add(img_idx)
        questions.append({
            'question': question['question'],
            'answer': question['answer'],
            # 'program': data['program'][index],
            'index': int(idx),
            'image_index': img_idx,
            # 'question_family_index': data['question_family_index'][index]
        })

    # Remap the GQA image-id strings to dense zero-based integer indices.
    imgs_idxs = sorted(imgs_idxs)
    mapper = {x: i for i, x in enumerate(imgs_idxs)}
    for q in questions:
        q['image_index'] = mapper[q['image_index']]

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            # Empty delim keeps each whole answer string as a single token.
            answer_token_to_idx = build_vocab(
                (q['answer'] for q in questions), delim='')
        else:
            answer_token_to_idx = None  # the vocab dict below needs the key
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            # Apparently, train and val in miniGQA have different answers,
            # so the answer vocab is expanded as well.
            num_new_answers = 0
            for word in new_vocab['answer_token_to_idx']:
                if word not in vocab['answer_token_to_idx']:
                    print('Found new answer %s' % word)
                    idx = len(vocab['answer_token_to_idx'])
                    vocab['answer_token_to_idx'][word] = idx
                    num_new_answers += 1
            print('Found %d new words' % num_new_words)
            print('Found %d new answers' % num_new_answers)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
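
# Tiny self-contained illustration of the image-index remapping performed in
# the GQA variant above: arbitrary imageId strings are sorted and mapped to
# dense zero-based integers. The ids here are made up for the example.
def _demo_image_index_remap():
    imgs_idxs = sorted({'n12345', 'n00017', 'n09999'})
    mapper = {x: i for i, x in enumerate(imgs_idxs)}
    assert mapper == {'n00017': 0, 'n09999': 1, 'n12345': 2}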