import json
import logging

import fitsio
import galsim
import h5py
import numpy as np

# module-level logger used by read_image_header below
logger = logging.getLogger(__name__)


def get_train_data(args):
    dataset = {}
    train_data = {}

    # load json file
    print('loading json file...')
    with open(args.input_json) as data_file:
        data = json.load(data_file)
    for key in data.keys():
        dataset[key] = data[key]

    # load image feature
    print('loading image feature...')
    with h5py.File(args.input_img_h5, 'r') as hf:
        # -----0~82459------
        tem = hf.get('images_train')
        img_feature = np.array(tem)

    # load h5 file
    print('loading h5 file...')
    with h5py.File(args.input_ques_h5, 'r') as hf:
        # total number of training data is 215375
        # question is (26, )
        tem = hf.get('ques_train')
        train_data['question'] = np.array(tem)
        # max length is 23
        tem = hf.get('ques_length_train')
        train_data['length_q'] = np.array(tem)
        # total 82460 img  -----1~82460-----
        tem = hf.get('img_pos_train')
        # convert into 0~82459
        train_data['img_list'] = np.array(tem) - 1
        # answer is 1~1000
        tem = hf.get('answers')
        train_data['answers'] = np.array(tem) - 1

    print('Normalizing image feature')
    if args.img_norm:
        # L2-normalize each image feature vector (per row, hence axis=1)
        tem = np.sqrt(np.sum(np.multiply(img_feature, img_feature), axis=1))
        img_feature = np.divide(img_feature,
                                np.transpose(np.tile(tem, (args.img_vec_dim, 1))))

    return dataset, img_feature, train_data
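
# A minimal usage sketch (hypothetical: the file names below are assumptions
# for illustration; only the attribute names on `args` are taken from the
# accesses inside get_train_data):
def _example_get_train_data():
    import argparse
    args = argparse.Namespace(
        input_json='data_prepro.json',   # hypothetical path
        input_img_h5='data_img.h5',      # hypothetical path
        input_ques_h5='data_prepro.h5',  # hypothetical path
        img_norm=1,
        img_vec_dim=4096,                # assumed image feature dimensionality
    )
    dataset, img_feature, train_data = get_train_data(args)
    print(train_data['question'].shape, img_feature.shape)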

def get_bert_embeddings(h5py_file, sent_idx, tokens):
    """Collect the pre-computed BERT embedding for each real token of a sentence."""
    # the file is keyed by sentence index, with one embedding row per token
    line = h5py_file.get(str(sent_idx))
    if line is None:
        raise Exception(
            f'BERT failed to find embedding: {sent_idx}, {tokens}')
    b_emb = []
    for i, word in enumerate(tokens):
        # skip synthetic tokens, which have no BERT embedding
        if word not in ['<ROOT>', '<unaligned>', '<eof>']:
            b_emb.append(line[i])
    return b_emb
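
# A minimal usage sketch for get_bert_embeddings (hypothetical file name; the
# assumed layout -- one dataset per sentence index, one embedding row per
# token -- mirrors the lookups inside the function):
def _example_get_bert_embeddings():
    with h5py.File('bert_embeddings.h5', 'r') as h5_file:  # hypothetical path
        tokens = ['<ROOT>', 'The', 'cat', 'sat']
        # returns embeddings for 'The', 'cat', 'sat'; '<ROOT>' is skipped
        b_emb = get_bert_embeddings(h5_file, sent_idx=0, tokens=tokens)
        print(len(b_emb))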

def read_image_header(row, img_file):
    """Read some information from the image header and write it into the df row.
    """
    hdu = 0

    # Note: The next line usually works, but fitsio doesn't support CONTINUE
    # lines, which DES image headers sometimes include.
    #   h = fitsio.read_header(img_file, hdu)
    # I don't care about any of the entries that sometimes use CONTINUE
    # (e.g. OBSERVER), so I just remove them and build the header from the
    # rest of the entries.
    f = fitsio.FITS(img_file)
    header_list = f[hdu].read_header_list()
    header_list = [d for d in header_list if 'CONTINUE' not in d['name']]
    h = fitsio.FITSHDR(header_list)
    try:
        date = h['DATE-OBS']
        date, time = date.strip().split('T', 1)

        filter = h['FILTER']
        filter = filter.split()[0]

        sat = h['SATURATE']
        fwhm = h['FWHM']

        ccdnum = int(h['CCDNUM'])
        detpos = h['DETPOS'].strip()  # read but not stored in the row

        telra = h['TELRA']
        teldec = h['TELDEC']
        telha = h['HA']
        if galsim.__version__ >= '1.5.1':
            telra = galsim.Angle.from_hms(telra) / galsim.degrees
            teldec = galsim.Angle.from_dms(teldec) / galsim.degrees
            telha = galsim.Angle.from_hms(telha) / galsim.degrees
        else:
            telra = galsim.HMS_Angle(telra) / galsim.degrees
            teldec = galsim.DMS_Angle(teldec) / galsim.degrees
            telha = galsim.HMS_Angle(telha) / galsim.degrees

        airmass = float(h.get('AIRMASS', -999))
        sky = float(h.get('SKYBRITE', -999))
        sigsky = float(h.get('SKYSIGMA', -999))
        tiling = int(h.get('TILING', 0))
        hex = int(h.get('HEX', 0))

    except Exception as e:
        logger.info("Caught %s", e)
        logger.info("Cannot read header information from %s", img_file)
        raise

    row['date'] = date
    row['time'] = time
    row['sat'] = sat
    row['fits_filter'] = filter
    row['fits_fwhm'] = fwhm
    row['fits_ccdnum'] = ccdnum
    row['telra'] = telra
    row['teldec'] = teldec
    row['telha'] = telha
    row['airmass'] = airmass
    row['sky'] = sky
    row['sigsky'] = sigsky
    row['tiling'] = tiling
    row['hex'] = hex
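
# A minimal usage sketch for read_image_header (hypothetical file name; `row`
# can be any dict-like object, e.g. a pandas Series or a plain dict):
def _example_read_image_header():
    row = {}
    read_image_header(row, 'des_image_c04.fits.fz')  # hypothetical DES image
    print(row['date'], row['fits_filter'], row['airmass'])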

def get_data_test(args):
    dataset = {}
    test_data = {}

    # load json file
    print('loading json file...')
    with open(args.input_json) as data_file:
        data = json.load(data_file)
    for key in data.keys():
        dataset[key] = data[key]

    # load image feature
    print('loading image feature...')
    with h5py.File(args.input_img_h5, 'r') as hf:
        # -----0~82459------
        tem = hf.get('images_test')
        img_feature = np.array(tem)

    # load h5 file
    print('loading h5 file...')
    with h5py.File(args.input_ques_h5, 'r') as hf:
        # total number of training data is 215375
        # question is (26, )
        tem = hf.get('ques_test')
        test_data['question'] = np.array(tem)
        # max length is 23
        tem = hf.get('ques_length_test')
        test_data['length_q'] = np.array(tem)
        # total 82460 img  -----1~82460-----
        tem = hf.get('img_pos_test')
        # convert into 0~82459
        test_data['img_list'] = np.array(tem) - 1
        # question id
        tem = hf.get('question_id_test')
        test_data['ques_id'] = np.array(tem)
        # MC_answer_test
        tem = hf.get('MC_ans_test')
        test_data['MC_ans_test'] = np.array(tem)

    print('Normalizing image feature')
    if args.img_norm:
        # L2-normalize each image feature vector (per row, hence axis=1)
        tem = np.sqrt(np.sum(np.multiply(img_feature, img_feature), axis=1))
        img_feature = np.divide(img_feature,
                                np.transpose(np.tile(tem, (args.img_vec_dim, 1))))

    nb_data_test = len(test_data['question'])
    with open(args.ans_file) as f:
        val_all_answers_dict = json.load(f)
    val_answers = np.zeros(nb_data_test, dtype=np.int32)
    ans_to_ix = {v: k for k, v in dataset['ix_to_ans'].items()}
    count_of_not_found = 0
    for i in range(nb_data_test):
        qid = test_data['ques_id'][i]
        try:
            # most_common() picks the most frequent answer string for the
            # question (see the sketch below)
            val_ans_ix = int(ans_to_ix[most_common(
                val_all_answers_dict[str(qid)])]) - 1
        except KeyError:
            count_of_not_found += 1
            val_ans_ix = 480
        val_answers[i] = val_ans_ix
    print('Beware: ' + str(count_of_not_found) +
          ' val answers were not found in the answer vocabulary'
          ' (defaulted to index 480)')

    return dataset, img_feature, test_data
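
# most_common() is referenced above but not defined in this snippet. A minimal
# implementation consistent with that call site (pick the most frequent answer
# string from a question's list of human answers) might be:
def most_common(answers):
    # return the element that occurs most often (ties broken arbitrarily)
    from collections import Counter
    return Counter(answers).most_common(1)[0][0]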