def get_train_data(args):

    dataset = {}
    train_data = {}
    # load json file
    print('loading json file...')
    with open(args.input_json) as data_file:
        data = json.load(data_file)
    for key in data.keys():
        dataset[key] = data[key]

    # load image feature
    print('loading image feature...')
    with h5py.File(args.input_img_h5, 'r') as hf:
        # -----0~82459------
        tem = hf.get('images_train')
        img_feature = np.array(tem)
    # load h5 file
    print('loading h5 file...')
    with h5py.File(args.input_ques_h5, 'r') as hf:
        # total number of training data is 215375
        # question is (26, )
        tem = hf.get('ques_train')
        train_data['question'] = np.array(tem)
        # max length is 23
        tem = hf.get('ques_length_train')
        train_data['length_q'] = np.array(tem)
        # total 82460 img
        #-----1~82460-----
        tem = hf.get('img_pos_train')
        # convert into 0~82459
        train_data['img_list'] = np.array(tem) - 1
        # answer is 1~1000
        tem = hf.get('answers')
        train_data['answers'] = np.array(tem) - 1

    print('Normalizing image feature')
    if img_norm:
        tem = np.sqrt(np.sum(np.multiply(img_feature, img_feature)))
        img_feature = np.divide(img_feature, np.tile(tem,
                                                     (1, args.img_vec_dim)))

    return dataset, img_feature, train_data
Esempio n. 2
0
def get_bert_embeddings(h5py, sent_idx, tokens):
    b_emb = []
    for i, word in enumerate(tokens):
        line = h5py.get(str(sent_idx))
        if line is None:
            raise Exception(
                f'BERT failed to find embedding: {sent_idx}, {tokens}')
        if word not in ['<ROOT>', '<unaligned>', '<eof>']:
            embedding = line[i]
            b_emb.append(embedding)
    return b_emb
Esempio n. 3
0
def read_image_header(row, img_file):
    """Read some information from the image header and write into the df row.
    """
    hdu = 0

    # Note: The next line usually works, but fitsio doesn't support CONTINUE lines, which DES
    #       image headers sometimes include.
    #h = fitsio.read_header(img_file, hdu)
    # I don't  care about any of the lines the sometimes use CONITNUE (e.g. OBSERVER), so I
    # just remove them and make the header with the rest of the entries.
    f = fitsio.FITS(img_file)
    header_list = f[hdu].read_header_list()
    header_list = [ d for d in header_list if 'CONTINUE' not in d['name'] ]
    h = fitsio.FITSHDR(header_list)
    try:
        date = h['DATE-OBS']
        date, time = date.strip().split('T',1)

        filter = h['FILTER']
        filter = filter.split()[0]

        sat = h['SATURATE']
        fwhm = h['FWHM']

        ccdnum = int(h['CCDNUM'])
        detpos = h['DETPOS'].strip()

        telra = h['TELRA']
        teldec = h['TELDEC']
        telha = h['HA']
        if galsim.__version__ >= '1.5.1':
            telra = galsim.Angle.from_hms(telra) / galsim.degrees
            teldec = galsim.Angle.from_dms(teldec) / galsim.degrees
            telha = galsim.Angle.from_hms(telha) / galsim.degrees
        else:
            telra = galsim.HMS_Angle(telra) / galsim.degrees
            teldec = galsim.DMS_Angle(teldec) / galsim.degrees
            telha = galsim.HMS_Angle(telha) / galsim.degrees

        airmass = float(h.get('AIRMASS',-999))
        sky = float(h.get('SKYBRITE',-999))
        sigsky = float(h.get('SKYSIGMA',-999))

        tiling = int(h.get('TILING',0))
        hex = int(h.get('HEX',0))

    except Exception as e:
        logger.info("Caught %s",e)
        logger.info("Cannot read header information from %s", img_file)
        raise

    row['date'] = date
    row['time'] = time
    row['sat'] = sat
    row['fits_filter'] = filter
    row['fits_fwhm'] = fwhm
    row['fits_ccdnum'] = ccdnum
    row['telra'] = telra
    row['teldec'] = teldec
    row['telha'] = telha
    row['airmass'] = airmass
    row['sky'] = sky
    row['sigsky'] = sigsky
    row['tiling'] = tiling
    row['hex'] = hex
def get_data_test(args):
    dataset = {}
    test_data = {}
    # load json file
    print('loading json file...')
    with open(args.input_json) as data_file:
        data = json.load(data_file)
    for key in data.keys():
        dataset[key] = data[key]

    # load image feature
    print('loading image feature...')
    with h5py.File(args.input_img_h5, 'r') as hf:
        # -----0~82459------
        tem = hf.get('images_test')
        img_feature = np.array(tem)
    # load h5 file
    print('loading h5 file...')
    with h5py.File(args.input_ques_h5, 'r') as hf:
        # total number of training data is 215375
        # question is (26, )
        tem = hf.get('ques_test')
        test_data['question'] = np.array(tem)
        # max length is 23
        tem = hf.get('ques_length_test')
        test_data['length_q'] = np.array(tem)
        # total 82460 img
        # -----1~82460-----
        tem = hf.get('img_pos_test')
        # convert into 0~82459
        test_data['img_list'] = np.array(tem) - 1
        # quiestion id
        tem = hf.get('question_id_test')
        test_data['ques_id'] = np.array(tem)
    # MC_answer_test
    tem = hf.get('MC_ans_test')
    test_data['MC_ans_test'] = np.array(tem)

    print('Normalizing image feature')
    if img_norm:
        tem = np.sqrt(np.sum(np.multiply(img_feature, img_feature)))
        img_feature = np.divide(img_feature, np.tile(tem,
                                                     (1, args.img_vec_dim)))

    nb_data_test = len(test_data[u'question'])
    val_all_answers_dict = json.load(open(args.ans_file))
    val_answers = np.zeros(nb_data_test, dtype=np.int32)

    ans_to_ix = {v: k for k, v in dataset[u'ix_to_ans'].items()}
    count_of_not_found = 0
    for i in xrange(nb_data_test):
        qid = test_data[u'ques_id'][i]
        try:
            val_ans_ix = int(ans_to_ix[most_common(
                val_all_answers_dict[str(qid)])]) - 1
        except KeyError:
            count_of_not_found += 1
            val_ans_ix = 480
        val_answers[i] = val_ans_ix
    print("Beware: " + str(count_of_not_found) +
          " number of val answers are not really correct")

    return dataset, img_feature, test_data