def save_query_to_hdf5_point(self, save_path_queries_hdf5, entry_id,
                             sample_original_space):
    """
    Append a query point and its entry id to the queries HDF5 file,
    creating the file on first use.
    """
    sample_original_space = to_vector(sample_original_space).T
    if os.path.isfile(save_path_queries_hdf5):
        with h5py.File(save_path_queries_hdf5, 'r+') as hf:
            points_dataset = hf.get('point_queries')
            already_in_points_ds = points_dataset.shape[0]
            points_dataset.resize(
                already_in_points_ds + sample_original_space.shape[0], axis=0)
            points_dataset[already_in_points_ds:already_in_points_ds +
                           sample_original_space.shape[0], :] = \
                sample_original_space
            entryids_dataset = hf.get('entry_ids')
            already_in_entryids_ds = entryids_dataset.len()
            entryids_dataset.resize(already_in_entryids_ds + 1, axis=0)
            entryids_dataset[already_in_entryids_ds:
                             already_in_entryids_ds + 1] = entry_id
            split_dict = {
                "data": {
                    "point_queries":
                    (0, already_in_points_ds + sample_original_space.shape[0]),
                    "entry_ids": (0, already_in_entryids_ds + 1)
                }
            }
            hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    else:
        # HDF5 query save file does not exist yet: create it.
        f = h5py.File(save_path_queries_hdf5, "w")
        points_dataset = f.create_dataset(
            'point_queries',
            sample_original_space.shape,
            maxshape=(None, sample_original_space.shape[1]),
            dtype="float32")
        points_dataset[...] = sample_original_space
        entryids_dataset = f.create_dataset(
            'entry_ids', (1, ), maxshape=(None, ), dtype=int)
        entryids_dataset[...] = entry_id
        split_dict = {
            "data": {
                "point_queries": (0, sample_original_space.shape[0]),
                "entry_ids": (0, 1)
            }
        }
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
        f.close()
def save_db_point_to_hdf5(self, db_point_scaled_space):
    """
    Save a decision boundary annotation to hdf5.

    :param db_point_scaled_space: (n_samples, n_features)
    :return:
    """
    try:
        db_point_original_space = self.scaling_transformation.inverse_transform(
            db_point_scaled_space)  # shape (1, nlat)
        if os.path.isfile(self.save_path_dbpoints_hdf5):
            with h5py.File(self.save_path_dbpoints_hdf5, 'r+') as hf:
                dbpoints_dataset = hf.get('db_points')
                already_in_dataset = dbpoints_dataset.shape[0]
                dbpoints_dataset.resize(
                    already_in_dataset + db_point_original_space.shape[0],
                    axis=0)
                dbpoints_dataset[already_in_dataset:already_in_dataset +
                                 db_point_original_space.shape[0], :] = \
                    db_point_original_space
                split_dict = {
                    "data": {
                        "db_points":
                        (0, already_in_dataset + db_point_original_space.shape[0])
                    }
                }
                hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
        else:
            # HDF5 db-point save file does not exist yet: create it.
            f = h5py.File(self.save_path_dbpoints_hdf5, "w")
            dbpoints_dataset = f.create_dataset(
                'db_points',
                db_point_original_space.shape,
                maxshape=(None, db_point_original_space.shape[1]),
                dtype="float32")
            dbpoints_dataset[...] = db_point_original_space
            split_dict = {
                "data": {
                    "db_points": (0, db_point_original_space.shape[0])
                }
            }
            f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            f.flush()
            f.close()
    except Exception:
        traceback.print_exc()
def build_contexts_ds(config, all_contexts, nsamples_train, nsamples_dev,
                      nsamples_test, nsamples_dev_big):
    logger.info('building contexts dataset')
    totals = len(all_contexts)
    ctx_dtype = h5py.special_dtype(vlen=np.dtype('int32'))
    logger.info("#contexts: %d", totals)
    with h5py.File(config['dsdir'] + "_contexts.hdf", mode='w') as fp:
        contexts = fp.create_dataset(
            'contexts',
            compression='gzip',
            data=[np.asarray(ctx.context) for ctx in all_contexts],
            shape=(totals, ),
            dtype=ctx_dtype)
        split_dict = {
            'train': {
                'contexts': (0, nsamples_train)
            },
            'dev': {
                'contexts': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'contexts': (nsamples_train + nsamples_dev,
                             nsamples_train + nsamples_dev + nsamples_test)
            },
            'devbig': {
                'contexts': (nsamples_train + nsamples_dev + nsamples_test,
                             totals)
            }
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
def build_entmentions_ds(config, all_contexts, nsamples_train, nsamples_dev,
                         nsamples_test, nsamples_dev_big):
    logger.info('building entmentions dataset')
    totals = len(all_contexts)
    ctx_dtype = h5py.special_dtype(vlen=np.dtype('uint32'))
    dsdir = config['dsdir']
    ctx_entity_dtype = np.dtype([("id", np.dtype(str), 64),
                                 ("token", np.dtype(str), 64),
                                 ("position", np.dtype('uint8'))])
    with h5py.File(dsdir + "_entmentions.hdf", mode='w') as fp:
        context_entities = fp.create_dataset(
            'entmentions',
            compression='gzip',
            data=np.asarray([(ctx.entity_id, ctx.entity_str, ctx.entity_idx)
                             for ctx in all_contexts],
                            dtype=ctx_entity_dtype))
        split_dict = {
            'train': {
                'entmentions': (0, nsamples_train)
            },
            'dev': {
                'entmentions': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'entmentions': (nsamples_train + nsamples_dev,
                                nsamples_train + nsamples_dev + nsamples_test)
            },
            'devbig': {
                'entmentions': (nsamples_train + nsamples_dev + nsamples_test,
                                totals)
            }
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
def build_mentions_ds(config, all_contexts, nsamples_train, nsamples_dev,
                      nsamples_test, nsamples_dev_big, max_len_men=4):
    logger.info('building mentions (indices of mention words) dataset')
    totals = len(all_contexts)
    dsdir = config['dsdir']
    mentions_m = numpy.ones(shape=(totals, max_len_men), dtype='int32')
    for i, ctx in enumerate(all_contexts):
        mentions_m[i] = ctx.mention
    with h5py.File(dsdir + "_mentions.hdf", mode='w') as fp:
        mentions = fp.create_dataset('mentions', mentions_m.shape,
                                     dtype='int32')
        mentions[...] = mentions_m
        split_dict = {
            'train': {
                'mentions': (0, nsamples_train)
            },
            'dev': {
                'mentions': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'mentions': (nsamples_train + nsamples_dev,
                             nsamples_train + nsamples_dev + nsamples_test)
            },
            'devbig': {
                'mentions': (nsamples_train + nsamples_dev + nsamples_test,
                             totals)
            }
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
def build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                      vectorfile, use_lowercase=False, max_num_words=10,
                      upto=None):
    if vectorfile is None:
        return
    # word vocabulary is built from mention names (only entity names are
    # used) across all splits
    word_to_idx, idx_to_word = build_word_vocab(
        trnMentions + devMentions + tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_words = numpy.zeros(shape=(totals, max_num_words), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        input_words[i] = get_ngram_seq(word_to_idx, words,
                                       max_len=max_num_words)
    logger.info('shape of subwords dataset: %s', input_words.shape)
    hdf5_file = dsdir + '_subwords.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('subwords', input_words.shape,
                                dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(word_to_idx,
                                          default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word,
                                          default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_words
    features.dims[0].label = 'words'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'subwords': (0, nsamples_train)},
        'dev': {'subwords': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'subwords': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building subwords dataset finished. It is saved in: %s',
                hdf5_file)
    logger.info('writing subword embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(
        vectorfile, vocab=word_to_idx, use_lowercase=use_lowercase, num=upto)
    with h5py.File(dsdir + "_subwords_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
def build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, hdf5_file,
                    vectorfile, upto=-1):
    (embeddings, word2idx, vectorsize) = read_embeddings(vectorfile, upto)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        entvec = numpy.zeros(vectorsize)
        if mye in word2idx:
            entvec = embeddings[word2idx[mye]]
        input_entvec[i] = entvec
    print input_entvec.shape
    hdf5_file += '_entvec.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('entvec', input_entvec.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = vectorsize
    features[...] = input_entvec
    features.dims[0].label = 'entity_vector'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'entvec': (0, nsamples_train)},
        'dev': {'entvec': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'entvec': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building entityVec dataset finished. It is saved in: %s',
                hdf5_file)
def _initialize_conversion(directory, output_path, image_shape):
    h5file = h5py.File(output_path, mode='w')
    split_dict = {
        'train': {
            'features': (0, TRAIN_STOP),
            'targets': (0, TRAIN_STOP)},
        'valid': {
            'features': (TRAIN_STOP, VALID_STOP),
            'targets': (TRAIN_STOP, VALID_STOP)},
        'test': {
            'features': (VALID_STOP, NUM_EXAMPLES),
            'targets': (VALID_STOP, NUM_EXAMPLES)}}
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    targets_dataset = h5file.create_dataset(
        'targets', (NUM_EXAMPLES, 40), dtype='uint8')
    targets_dataset.dims[0].label = 'batch'
    targets_dataset.dims[1].label = 'target'
    # Attributes are stored on disk as +/-1; map them to {0, 1}.
    targets_dataset[...] = (
        numpy.loadtxt(os.path.join(directory, ATTRIBUTES_FILE),
                      dtype='int32', skiprows=2,
                      usecols=tuple(range(1, 41))) + 1) / 2
    features_dataset = h5file.create_dataset(
        'features', (NUM_EXAMPLES, 3) + image_shape, dtype='uint8')
    features_dataset.dims[0].label = 'batch'
    features_dataset.dims[1].label = 'channel'
    features_dataset.dims[2].label = 'height'
    features_dataset.dims[3].label = 'width'
    return h5file
def build_hsNgram_ds(config, trnMentions, devMentions, tstMentions, t2idx,
                     hdf5_file, embpath, emb_list, vectorsize=200, upto=-1):
    print "building hs Ngram datasets: ", emb_list
    for emb_version in emb_list:
        print emb_version
        mypath = os.path.join(embpath, emb_version)
        nsamples_train = len(trnMentions)
        nsamples_dev = len(devMentions)
        totals = nsamples_train + nsamples_dev + len(tstMentions)
        vectorsize = get_vec_size(mypath + '/train.txt')
        input_hsngram_matrix = numpy.zeros(shape=(totals, vectorsize),
                                           dtype='float32')
        input_hsngram_matrix[0:nsamples_train] = load_embmatirx(
            mypath + '/train.txt', len(trnMentions), vectorsize, upto)
        input_hsngram_matrix[nsamples_train:nsamples_train + nsamples_dev] = \
            load_embmatirx(mypath + '/dev.txt', len(devMentions), vectorsize,
                           upto)
        input_hsngram_matrix[nsamples_train + nsamples_dev:totals] = \
            load_embmatirx(mypath + '/test.txt', len(tstMentions), vectorsize,
                           upto)
        print input_hsngram_matrix.shape
        srcname = 'hsngram_' + emb_version
        # Use a fresh output name per embedding version; reassigning
        # hdf5_file here would keep appending suffixes on every iteration.
        out_file = hdf5_file + '_' + srcname + '.h5py'
        print out_file
        f = h5py.File(out_file, mode='w')
        features = f.create_dataset(srcname, input_hsngram_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = vectorsize
        features[...] = input_hsngram_matrix
        features.dims[0].label = srcname + '_vector'
        split_dict = {
            'train': {srcname: (0, nsamples_train)},
            'dev': {srcname: (nsamples_train, nsamples_train + nsamples_dev)},
            'test': {srcname: (nsamples_train + nsamples_dev, totals)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
        f.close()
        logger.info('Building Hinrich ngram-level embeddings of mentions '
                    'finished. Saved in: %s', out_file)
def build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx,
                        hdf5_file, vectorfile, upto=-1):
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        entvec = numpy.zeros(vectorsize)
        if mye in voc2idx:
            entvec = embeddings[voc2idx[mye]]
        input_entvec[i] = entvec
    typevecmatrix = buildtypevecmatrix(
        t2idx, embeddings, vectorsize, voc2idx)  # a matrix with size: 102 * dim
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)
    hdf5_file += '_tc.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('tc', ent_types_cosin_matrix.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
    features[...] = ent_types_cosin_matrix
    features.dims[0].label = 'types_ent_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'tc': (0, nsamples_train)},
        'dev': {'tc': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'tc': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-ent cosine (tc) dataset finished. '
                'It is saved in: %s', hdf5_file)
def _initialize_conversion(directory, output_path, image_shape):
    h5file = h5py.File(output_path, mode='w')
    split_dict = {
        'train': {
            'features': (0, TRAIN_STOP),
            'targets': (0, TRAIN_STOP)
        },
        'valid': {
            'features': (TRAIN_STOP, VALID_STOP),
            'targets': (TRAIN_STOP, VALID_STOP)
        },
        'test': {
            'features': (VALID_STOP, NUM_EXAMPLES),
            'targets': (VALID_STOP, NUM_EXAMPLES)
        }
    }
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    targets_dataset = h5file.create_dataset('targets',
                                            (NUM_EXAMPLES, ) + image_shape,
                                            dtype='uint8')
    targets_dataset.dims[0].label = 'batch'
    targets_dataset.dims[1].label = 'height'
    targets_dataset.dims[2].label = 'width'
    features_dataset = h5file.create_dataset('features',
                                             (NUM_EXAMPLES, 3) + image_shape,
                                             dtype='uint8')
    features_dataset.dims[0].label = 'batch'
    features_dataset.dims[1].label = 'channel'
    features_dataset.dims[2].label = 'height'
    features_dataset.dims[3].label = 'width'
    return h5file
def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    n_labeled = n_train + n_valid
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,),
                             dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_labeled, 1),
                             dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')
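# Usage sketch for prepare_hdf5_file (illustrative only: the file name and
# split sizes are made up, and `create_splits` is assumed to be a sibling
# helper that returns the dict(split -> dict(source -> (start, stop)))
# mapping expected by H5PYDataset.create_split_array).
import h5py

with h5py.File('dataset.hdf5', mode='w') as h5file:
    prepare_hdf5_file(h5file, n_train=1000, n_valid=200, n_test=100)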
def build_targets_ds(config, all_contexts, nsamples_train, nsamples_dev,
                     nsamples_test, nsamples_dev_big):
    logger.info("building targets dataset")
    entity_types = list(load_types(config['typefile']))
    (t2idx, _) = cmn.loadtypes(config['typefile'])
    totals = len(all_contexts)
    targets_m = numpy.zeros(shape=(totals, len(t2idx)), dtype='int32')
    for i, ctx in enumerate(all_contexts):
        types_idx = [t2idx[t] for t in ctx.all_types if t in t2idx]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, len(t2idx))
    dsdir = config['dsdir']
    fp = h5py.File(dsdir + '_targets.hdf', mode='w')
    targets = fp.create_dataset('targets', targets_m.shape, dtype='int32')
    targets.attrs['type_to_ix'] = yaml.dump(t2idx)
    targets[...] = targets_m
    targets.dims[0].label = 'all_types'
    split_dict = {
        'train': {
            'targets': (0, nsamples_train)
        },
        'dev': {
            'targets': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'targets': (nsamples_train + nsamples_dev,
                        nsamples_train + nsamples_dev + nsamples_test)
        },
        'devbig': {
            'targets': (nsamples_train + nsamples_dev + nsamples_test,
                        totals)
        }
    }
    fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    fp.flush()
    fp.close()
def build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir):
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    targets_m = numpy.zeros(shape=(totals, len(t2idx)), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        types_idx = [t2idx[t] for t in men.alltypes]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, len(t2idx))
    hdf5_file = dsdir + '_targets.h5py'
    f = h5py.File(hdf5_file, mode='w')
    targets = f.create_dataset('targets', targets_m.shape, dtype='int32')
    targets.attrs['type_to_ix'] = yaml.dump(t2idx)
    targets[...] = targets_m
    targets.dims[0].label = 'all_types'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {
            'targets': (0, nsamples_train)
        },
        'dev': {
            'targets': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'targets': (nsamples_train + nsamples_dev, totals)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                     vectorfile=None, max_len_name=30):
    # character vocabulary is built from the training mentions only, since
    # only entity names are used for characters
    char_to_idx, idx_to_char = build_char_vocab(trnMentions)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_letters = numpy.zeros(shape=(totals, max_len_name), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        input_letters[i] = get_ngram_seq(char_to_idx, name, max_len_name)
    print input_letters.shape
    fuelfile = dsdir + '_letters.h5py'
    f = h5py.File(fuelfile, mode='w')
    features = f.create_dataset('letters', input_letters.shape,
                                dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(char_to_idx,
                                          default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_char,
                                          default_flow_style=False)
    features.attrs['vocabsize'] = len(char_to_idx)
    features[...] = input_letters
    features.dims[0].label = 'letters'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'letters': (0, nsamples_train)},
        'dev': {'letters': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'letters': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('building letters dataset finished. It is saved in: %s',
                fuelfile)
    if vectorfile is None:
        return
    embeddings, vectorsize = read_embeddings_vocab(vectorfile,
                                                   vocab=char_to_idx, num=-1)
    logger.info('size of embedding matrix to save is: (%d, %d)',
                embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + "_letters_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize
def test_celeba():
    data_path = config.data_path
    try:
        config.data_path = '.'
        f = h5py.File('celeba_64.hdf5', 'w')
        f['features'] = numpy.arange(
            10 * 3 * 64 * 64, dtype='uint8').reshape((10, 3, 64, 64))
        f['targets'] = numpy.arange(10 * 40, dtype='uint8').reshape((10, 40))
        split_dict = {'train': {'features': (0, 6), 'targets': (0, 6)},
                      'valid': {'features': (6, 8), 'targets': (6, 8)},
                      'test': {'features': (8, 10), 'targets': (8, 10)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.close()
        dataset = CelebA(which_format='64', which_sets=('train',))
        assert_equal(dataset.filename, 'celeba_64.hdf5')
    finally:
        config.data_path = data_path
        os.remove('celeba_64.hdf5')
def fill_hdf5_file(h5file, data):
    """Fills an HDF5 file in an H5PYDataset-compatible manner.

    Parameters
    ----------
    h5file : :class:`h5py.File`
        File handle for an HDF5 file.
    data : tuple of tuple
        One element per split/source pair. Each element consists of a
        tuple of (split_name, source_name, data_array, comment), where

        * 'split_name' is a string identifier for the split name
        * 'source_name' is a string identifier for the source name
        * 'data_array' is a :class:`numpy.ndarray` containing the data
          for this split/source pair
        * 'comment' is a comment string for the split/source pair

        The 'comment' element can optionally be omitted.

    """
    # Check that all sources for a split have the same length
    split_names = set(split_tuple[0] for split_tuple in data)
    for name in split_names:
        lengths = [
            len(split_tuple[2]) for split_tuple in data
            if split_tuple[0] == name
        ]
        if not all(le == lengths[0] for le in lengths):
            raise ValueError("split '{}' has sources that ".format(name) +
                             "vary in length")

    # Initialize split dictionary
    split_dict = dict([(split_name, {}) for split_name in split_names])

    # Compute total source lengths and check that splits have the same dtype
    # across a source
    source_names = set(split_tuple[1] for split_tuple in data)
    for name in source_names:
        splits = [s for s in data if s[1] == name]
        indices = numpy.cumsum([0] + [len(s[2]) for s in splits])
        if not all(s[2].dtype == splits[0][2].dtype for s in splits):
            raise ValueError("source '{}' has splits that ".format(name) +
                             "vary in dtype")
        if not all(s[2].shape[1:] == splits[0][2].shape[1:] for s in splits):
            raise ValueError("source '{}' has splits that ".format(name) +
                             "vary in shapes")
        dataset = h5file.create_dataset(
            name, (sum(len(s[2]) for s in splits), ) + splits[0][2].shape[1:],
            dtype=splits[0][2].dtype)
        dataset[...] = numpy.concatenate([s[2] for s in splits], axis=0)
        for i, j, s in zip(indices[:-1], indices[1:], splits):
            if len(s) == 4:
                split_dict[s[0]][name] = (i, j, None, s[3])
            else:
                split_dict[s[0]][name] = (i, j)
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
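# Minimal usage sketch for fill_hdf5_file: one (split_name, source_name,
# data_array) tuple per split/source pair. Arrays of the same source are
# concatenated in the order given, and the split array is derived from
# their lengths. The file name and shapes here are made up.
import h5py
import numpy

train_x = numpy.zeros((8, 3), dtype='float32')
test_x = numpy.ones((2, 3), dtype='float32')
train_y = numpy.zeros((8, 1), dtype='int64')
test_y = numpy.ones((2, 1), dtype='int64')
with h5py.File('toy.hdf5', mode='w') as h5file:
    fill_hdf5_file(h5file, (('train', 'features', train_x),
                            ('test', 'features', test_x),
                            ('train', 'targets', train_y),
                            ('test', 'targets', test_y)))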
def save_decision_boundary(self, w, b):
    """
    Append a decision boundary (weights w and bias b) to the boundaries
    HDF5 file, creating the file on first use.
    """
    w = to_vector(w).T
    if os.path.isfile(self.save_path_boundaries):
        with h5py.File(self.save_path_boundaries, 'r+') as hf:
            w_dataset = hf.get('w')
            already_in_w_ds = w_dataset.shape[0]
            w_dataset.resize(already_in_w_ds + w.shape[0], axis=0)
            w_dataset[already_in_w_ds:already_in_w_ds + w.shape[0], :] = w
            b_dataset = hf.get('b')
            already_in_b_ds = b_dataset.len()
            b_dataset.resize(already_in_b_ds + 1, axis=0)
            b_dataset[already_in_b_ds:already_in_b_ds + 1] = b
            split_dict = {
                "data": {
                    "w": (0, already_in_w_ds + w.shape[0]),
                    "b": (0, already_in_b_ds + 1)
                }
            }
            hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    else:
        # HDF5 boundary save file does not exist yet: create it.
        f = h5py.File(self.save_path_boundaries, "w")
        w_dataset = f.create_dataset('w', w.shape,
                                     maxshape=(None, w.shape[1]),
                                     dtype="float32")
        w_dataset[...] = w
        b_dataset = f.create_dataset('b', (1, ), maxshape=(None, ),
                                     dtype="float32")
        b_dataset[...] = b
        split_dict = {"data": {"w": (0, w.shape[0]), "b": (0, 1)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
        f.close()
def make_fuel_dataset(file_name, training_set_data, testing_set_data):
    """
    Write training and testing examples to a Fuel-compatible HDF5 file.

    :param file_name: path of the HDF5 file to create
    :type file_name: str
    :param training_set_data:
    :type training_set_data: list[dict[str, numpy.core.multiarray.ndarray]]
    :param testing_set_data:
    :type testing_set_data: list[dict[str, numpy.core.multiarray.ndarray]]
    :return:
    :rtype:
    """
    f = h5py.File(file_name, mode='w')
    nb_training_examples = len(training_set_data)
    nb_testing_examples = len(testing_set_data)
    nb_examples = nb_training_examples + nb_testing_examples
    # (dataset name, source key in the data dicts, dtype)
    ts = [
        ('audio_features', 'audio_features', 'float32'),
        ('targets_weak_alarm', 'classes_alarm_weak', 'uint8'),
        ('targets_weak_vehicle', 'classes_vehicle_weak', 'uint8'),
        ('targets_strong_alarm', 'classes_alarm_strong', 'uint8'),
        ('targets_strong_vehicle', 'classes_vehicle_strong', 'uint8')
    ]
    shape_labels = ['batch'.encode('utf8'), 'time_frames'.encode('utf8'),
                    'features'.encode('utf8')]
    datasets = []
    datasets_shapes = []
    datasets_shapes_labels = []
    for t in ts:
        datasets.append(f.create_dataset(
            t[0], (nb_examples, ),
            dtype=h5py.special_dtype(vlen=np.dtype(t[2]))))
        datasets[-1][...] = [
            entry[t[1]].flatten()
            for entry in training_set_data + testing_set_data]
        datasets_shapes.append(f.create_dataset(
            '{}_shapes'.format(t[0]), (nb_examples, 3), dtype='int32'))
        datasets_shapes[-1][...] = np.array(
            [entry[t[1]].reshape((1, ) + entry[t[1]].shape).shape
             for entry in training_set_data + testing_set_data])
        datasets[-1].dims.create_scale(datasets_shapes[-1], 'shapes')
        datasets[-1].dims[0].attach_scale(datasets_shapes[-1])
        datasets_shapes_labels.append(f.create_dataset(
            '{}_shape_labels'.format(t[0]), (3, ), dtype='S11'))
        datasets_shapes_labels[-1][...] = shape_labels
        datasets[-1].dims.create_scale(datasets_shapes_labels[-1],
                                       'shape_labels')
        datasets[-1].dims[0].attach_scale(datasets_shapes_labels[-1])
    split_dict = {
        'train': {t[0]: (0, nb_training_examples) for t in ts},
        'test': {t[0]: (nb_training_examples, nb_examples) for t in ts}
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
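# Read-back sketch for the file written above: because each variable-length
# source has a '*_shapes' dimension scale attached, Fuel's H5PYDataset can
# restore the per-example (1, time_frames, features) shape when iterating.
# `file_name` here is whatever path was passed to make_fuel_dataset.
dataset = H5PYDataset(file_name, which_sets=('train',),
                      sources=('audio_features',))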
def save_splits(self):
    """
    Save the splits to the data file.
    """
    logger.info('Producing dataset splits...')
    for split in self.splits:
        split.save()
    split_dict = {split.name: split.to_dict() for split in self.splits}
    splits = H5PYDataset.create_split_array(split_dict)
    logger.debug('split: %s', splits)
    logger.info('Saving splits...')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        self.data_file.attrs['split'] = splits
def make_lr_fuel_file(outfile, inda, indb, indc, X, y):
    """
    Makes a FUEL dataset that combines both left and right features.

    :param outfile: path of the HDF5 file to create
    :param inda: number of training examples
    :param indb: number of validation examples
    :param indc: number of test examples
    :param X: dict with 'l' and 'r' feature arrays
    :param y: target array
    :return:
    """
    # Create the HDF5 file:
    f = h5py.File(outfile, mode='w')
    targets = f.create_dataset('targets', y.shape, dtype='int8')
    l_features = f.create_dataset('l_features', X['l'].shape, dtype='int8')
    r_features = f.create_dataset('r_features', X['r'].shape, dtype='int8')
    # Load the data into it:
    l_features[...] = X['l']
    r_features[...] = X['r']
    targets[...] = y
    # Label the axes:
    targets.dims[0].label = 'sample'
    targets.dims[1].label = 'class'
    l_features.dims[0].label = 'sample'
    l_features.dims[1].label = 'feature'
    r_features.dims[0].label = 'sample'
    r_features.dims[1].label = 'feature'
    # Make a "splits" dictionary as required by Fuel
    split_dict = {
        'train': {'l_features': (0, inda),
                  'r_features': (0, inda),
                  'targets': (0, inda)},
        'valid': {'l_features': (inda, inda + indb),
                  'r_features': (inda, inda + indb),
                  'targets': (inda, inda + indb)},
        'test': {'l_features': (inda + indb, inda + indb + indc),
                 'r_features': (inda + indb, inda + indb + indc),
                 'targets': (inda + indb, inda + indb + indc)},
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    # Save this new dataset to file
    f.flush()
    f.close()
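# Read-back sketch (hedged): load only the training split of the file
# written by make_lr_fuel_file above.
train_set = H5PYDataset(outfile, which_sets=('train',),
                        sources=('l_features', 'r_features', 'targets'))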
def test_svhn():
    data_path = config.data_path
    try:
        config.data_path = '.'
        f = h5py.File('svhn_format_2.hdf5', 'w')
        f['features'] = numpy.arange(100, dtype='uint8').reshape((10, 10))
        f['targets'] = numpy.arange(10, dtype='uint8').reshape((10, 1))
        split_dict = {'train': {'features': (0, 8), 'targets': (0, 8)},
                      'test': {'features': (8, 10), 'targets': (8, 10)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.close()
        dataset = SVHN(which_format=2, which_sets=('train',))
        assert_equal(dataset.filename, 'svhn_format_2.hdf5')
    finally:
        config.data_path = data_path
        os.remove('svhn_format_2.hdf5')
def generate_h5(labels, name='breakhis_40PX', path='storage/breakhis/40PX',
                target_size=(256, 256), img_size=[256, 256]):
    n_samples = len(labels)
    n_test, n_dev = int(len(labels) * .3), int(len(labels) * .2)
    n_train = n_samples - n_test - n_dev
    f = h5py.File(f'{name}.hdf5', mode='w')
    _images = f.create_dataset('images', [len(labels)] + img_size + [3],
                               dtype='int32')
    _labels = f.create_dataset('labels', [len(labels)], dtype='int32')
    _images.dims[0].label = 'batch'
    _images.dims[1].label = 'width'
    _images.dims[2].label = 'height'
    _images.dims[3].label = 'channel'
    _labels.dims[0].label = 'batch'
    B = np.copy(np.array(list(labels.keys())))
    rng = np.random.RandomState()
    perm = rng.permutation(B)
    for i in tqdm(range(perm.shape[0])):
        name = perm[i]
        image_path = f"{path}/{name}"
        img = load_img(image_path, target_size=target_size)
        _images[i] = img_to_array(img)
        _labels[i] = labels[name]
    split_dict = {
        'train': {
            'images': (0, n_train),
            'labels': (0, n_train)
        },
        'dev': {
            'images': (n_train, n_train + n_dev),
            'labels': (n_train, n_train + n_dev)
        },
        'test': {
            'images': (n_train + n_dev, n_samples),
            'labels': (n_train + n_dev, n_samples)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def test_camvid():
    data_path = config.data_path
    try:
        config.data_path = '.'
        f = h5py.File('camvid.hdf5', 'w')
        f['features'] = numpy.arange(
            10 * 3 * 360 * 480, dtype='uint8').reshape((10, 3, 360, 480))
        f['targets'] = numpy.arange(
            10 * 360 * 480, dtype='uint8').reshape((10, 360, 480))
        split_dict = {'train': {'features': (0, 6), 'targets': (0, 6)},
                      'valid': {'features': (6, 8), 'targets': (6, 8)},
                      'test': {'features': (8, 10), 'targets': (8, 10)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.close()
        dataset = Camvid(which_sets=('train',))
        assert_equal(dataset.filename, 'camvid.hdf5')
    finally:
        config.data_path = data_path
def save_encoded_test_data(save_file, X_test_encoded, y_test):
    import h5py
    f = h5py.File(save_file, mode="w")
    features = f.create_dataset("features", X_test_encoded.shape,
                                dtype="float32")
    targets = f.create_dataset("targets", y_test.shape, dtype="uint8")
    features[...] = X_test_encoded
    targets[...] = y_test
    split_dict = {
        "test": {
            "features": (0, X_test_encoded.shape[0]),
            "targets": (0, X_test_encoded.shape[0])
        }
    }
    f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
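# Read-back sketch: the file above defines a single 'test' split, so the
# encoded features can be reloaded in memory like this.
test_set = H5PYDataset(save_file, which_sets=('test',), load_in_memory=True)
X_test_encoded, y_test = test_set.data_sources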
def build_desc_features_ds(trnMentions, devMentions, tstMentions,
                           ent2tfidf_features_path, t2idx, dsdir, vectorfile,
                           use_lowercase=True, upto=None):
    if ent2tfidf_features_path is None:
        print "Warning: skipping tfidf features building..."
        return
    ent2features = load_ent2features(ent2tfidf_features_path)
    word_to_idx, idx_to_word = build_voc_from_features(ent2features)
    logger.info('tfidf desc features vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_features = numpy.zeros(
        shape=(totals, len(ent2features.values()[0])), dtype='int32')
    ent_no_emb = 0
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        if men.entityId not in ent2features:
            ent_no_emb += 1
            continue
        features = ent2features[men.entityId]
        input_features[i] = get_ngram_seq(word_to_idx, features,
                                          max_len=input_features.shape[1])
    logger.info('shape of tfidf input dataset: %s', input_features.shape)
    logger.info('number of entities without embeddings: %d', ent_no_emb)
    hdf5_file = dsdir + '_desc_features.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('desc_features', input_features.shape,
                                dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(word_to_idx,
                                          default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word,
                                          default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_features
    features.dims[0].label = 'description_features'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'desc_features': (0, nsamples_train)},
        'dev': {'desc_features': (nsamples_train,
                                  nsamples_train + nsamples_dev)},
        'test': {'desc_features': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building desc_features dataset finished. It is saved in: %s',
                hdf5_file)
    logger.info('writing word embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(
        vectorfile, vocab=word_to_idx, use_lowercase=use_lowercase, num=upto)
    print "embeddings shape: ", idx2embeddings.shape
    with h5py.File(dsdir + "_desc_features_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
def make_one_sided_fuel_file(outfile, inda, indb, indc, X, y, side):
    """
    Makes a dataset that includes only a single side of features.

    :param outfile: path of the HDF5 file to create
    :param inda: number of training examples
    :param indb: number of validation examples
    :param indc: number of test examples
    :param X: feature array for the chosen side
    :param y: target array
    :param side: name prefix for the features source (e.g. 'l' or 'r')
    :return:
    """
    # Create the HDF5 file:
    f = h5py.File(outfile, mode='w')
    targets = f.create_dataset('targets', y.shape, dtype='int8')
    features = f.create_dataset('{}_features'.format(side), X.shape,
                                dtype='int8')
    # Load the data into it:
    features[...] = X
    targets[...] = y
    # Label the axes:
    targets.dims[0].label = 'sample'
    targets.dims[1].label = 'class'
    features.dims[0].label = 'sample'
    features.dims[1].label = 'feature'
    # Make a "splits" dictionary as required by Fuel
    split_dict = {
        'train': {'{}_features'.format(side): (0, inda),
                  'targets': (0, inda)},
        'valid': {'{}_features'.format(side): (inda, inda + indb),
                  'targets': (inda, inda + indb)},
        'test': {'{}_features'.format(side): (inda + indb,
                                              inda + indb + indc),
                 'targets': (inda + indb, inda + indb + indc)},
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    # Save this new dataset to file
    f.flush()
    f.close()
def save_indices_unlabeled(self, opt, unlabeled_indices):
    path_indices_hdf5 = os.path.join(
        opt.save_path,
        os.path.normpath(opt.save_path) + "_unlabeled_indices.hdf5")
    if os.path.isfile(path_indices_hdf5):
        print "This file already exists %s" % path_indices_hdf5
        quit(0)
    f = h5py.File(path_indices_hdf5, "w")
    unlabeled_indices_dataset = f.create_dataset('unlabeled_indices',
                                                 unlabeled_indices.shape,
                                                 maxshape=(None, ))
    unlabeled_indices_dataset[...] = unlabeled_indices
    split_dict = {
        "data": {
            "unlabeled_indices": (0, len(unlabeled_indices))
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx,
                               dsdir, vectorfile, upto=-1, max_num_words=4):
    # word vocabulary is built from mention names across all splits
    word_to_idx, idx_to_word = build_word_vocab(
        trnMentions + devMentions + tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    idx2embeddings, vectorsize = read_embeddings_vocab(
        vectorfile, vocab=word_to_idx, num=upto)
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        # average the embeddings of the mention words
        avgvec = numpy.zeros(shape=(vectorsize))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= len(seq_words)
        input_avg[i] = avgvec
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    typevecmatrix = buildtypevecmatrix(
        t2idx, embeddings, vectorsize, voc2idx)  # a matrix with size: 102 * dim
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)
    dsdir += '_tcwords.h5py'
    f = h5py.File(dsdir, mode='w')
    features = f.create_dataset('tcwords', words_types_cosin_matrix.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
    features[...] = words_types_cosin_matrix
    features.dims[0].label = 'words_types_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'tcwords': (0, nsamples_train)},
        'dev': {'tcwords': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'tcwords': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-words cosine (tcwords) dataset finished. '
                'It is saved in: %s', dsdir)
def writeDataset(self, hdf5, hdf5_filepath, testImages, testLabels,
                 testNames):
    num_examples = len(testImages)
    # store images
    image_size = (256, 256)
    array_shape = (len(testLabels), 3) + image_size
    ds_images = hdf5.create_dataset("images", array_shape, dtype=np.uint8)
    ds_images.dims[0].label = "batch"
    ds_images.dims[1].label = "channel"
    ds_images.dims[2].label = "height"
    ds_images.dims[3].label = "width"
    # write images to the disk
    for i, filename in tqdm(enumerate(testImages), total=num_examples,
                            desc=hdf5_filepath):
        raw_image = cv2.imread(filename, cv2.IMREAD_COLOR)  # BGR image
        image = self.preprocess(raw_image, image_size)
        ds_images[i] = image
    # store the targets (class labels)
    targets = np.array(testLabels, np.int32).reshape(num_examples, 1)
    ds_targets = hdf5.create_dataset("targets", data=targets)
    ds_targets.dims[0].label = "batch"
    ds_targets.dims[1].label = "class_labels"
    # specify the splits
    uniqueLabels = list(np.unique(testLabels))
    middleLabel = uniqueLabels[int(np.round(len(uniqueLabels) / 2))]
    test_head = testLabels.index(middleLabel)
    split_train, split_test = (0, test_head), (test_head, num_examples)
    split_dict = dict(train=dict(images=split_train, targets=split_train),
                      test=dict(images=split_test, targets=split_test))
    hdf5.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    hdf5.flush()
    hdf5.close()
def build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                   vectorfile, ngram, max_num_ngrams=98, upto=-1):
    # ngram vocabulary is built from mention names across all splits
    ngram_to_idx, idx_to_word, name2ngrams = build_ngram_vocab(
        trnMentions + devMentions + tstMentions, ngram=ngram, MIN_FREQ=5)
    logger.info('ngram%d vocab size: %d', ngram, len(ngram_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_words = numpy.zeros(shape=(totals, max_num_ngrams), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        ngrams = name2ngrams[name]
        input_words[i] = get_ngram_seq(ngram_to_idx, ngrams,
                                       max_len=max_num_ngrams)
    print input_words.shape
    ngram_label = 'ngrams' + str(ngram)
    hdf5_file = dsdir + '_ngrams' + str(ngram) + '.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset(ngram_label, input_words.shape,
                                dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(ngram_to_idx,
                                          default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word,
                                          default_flow_style=False)
    features.attrs['vocabsize'] = len(ngram_to_idx)
    features[...] = input_words
    features.dims[0].label = ngram_label
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {ngram_label: (0, nsamples_train)},
        'dev': {ngram_label: (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {ngram_label: (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building ngram%d dataset finished. It is saved in: %s',
                ngram, hdf5_file)
    if vectorfile is None or vectorfile == '':
        return
    logger.info('Now, writing ngram embeddings')
    embeddings, vectorsize = read_embeddings_vocab(vectorfile,
                                                   vocab=ngram_to_idx,
                                                   num=upto)
    logger.info('size of embedding matrix to save is: (%d, %d)',
                embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + "_" + ngram_label + "_embeddings.h5py",
                   mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize
f = h5py.File('../../data/dataset.hdf5', mode='w')
images = f.create_dataset('images', train_feature.shape, dtype='float32')
targets = f.create_dataset('targets', train_target.shape, dtype='float32')
images[...] = train_feature
targets[...] = train_target
split_dict = {
    'train': {
        'images': (0, train_feature.shape[0]),
        'targets': (0, train_target.shape[0])
    }
}
f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
f.flush()
f.close()

train_set = H5PYDataset('../../data/dataset.hdf5', which_sets=('train',))
#data_stream = DataStream(dataset=train_set, iteration_scheme=scheme)
#state = train_set.open()
scheme = ShuffledScheme(examples=train_set.num_examples, batch_size=4)
data_stream = DataStream(dataset=train_set, iteration_scheme=scheme)

for data in data_stream.get_epoch_iterator():
    print(data[0], data[1])

# The original snippet breaks off here; scale and shift are assumed to be
# precomputed standardization constants (e.g. from training statistics).
standardized_stream = ScaleAndShift(data_stream=data_stream,
                                    scale=scale, shift=shift,
                                    which_sources=('images',))
split_dict = {
    'train': {
        'letters': (0, nsamples_train),
        'w2v_entvec': (0, nsamples_train),
        'w2v_mention': (0, nsamples_train),
        'w2v_typecos': (0, nsamples_train),
        'nsl_features': (0, nsamples_train),
        'targets': (0, nsamples_train)
    },
    'dev': {
        'letters': (nsamples_train, nsamples_train + nsamples_dev),
        'w2v_entvec': (nsamples_train, nsamples_train + nsamples_dev),
        'w2v_mention': (nsamples_train, nsamples_train + nsamples_dev),
        'w2v_typecos': (nsamples_train, nsamples_train + nsamples_dev),
        'nsl_features': (nsamples_train, nsamples_train + nsamples_dev),
        'targets': (nsamples_train, nsamples_train + nsamples_dev)
    },
    'test': {
        'letters': (nsamples_train + nsamples_dev, nsamples),
        'w2v_entvec': (nsamples_train + nsamples_dev, nsamples),
        'w2v_mention': (nsamples_train + nsamples_dev, nsamples),
        'w2v_typecos': (nsamples_train + nsamples_dev, nsamples),
        'nsl_features': (nsamples_train + nsamples_dev, nsamples),
        'targets': (nsamples_train + nsamples_dev, nsamples)
    }
}
f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
f.flush()
f.close()
def write_h5py_dataset(nested_dataset_dict, sources_dim_labels, file_path,
                       dtype=config.floatX):
    """
    Creates a h5py file based dataset (writes this to disk).

    Parameters
    ----------
    nested_dataset_dict : { <subset>: { <source>: numpy.ndarray() } }, where
        <subset> : str
        <source> : str
    sources_dim_labels : { <source>: [str] }
        Length of each list equals the number of dimensions of the source.
    file_path : str

    Returns
    -------
    h5py_file_path
    """
    previous_end_idx = 0
    source_to_list_of_subsets = {}
    split_dict = {}
    for subset_name in nested_dataset_dict.keys():
        if subset_name not in split_dict:
            split_dict[subset_name] = {}
        n_samples = nested_dataset_dict[subset_name].values()[0].shape[0]
        for source_name in nested_dataset_dict[subset_name].keys():
            assert nested_dataset_dict[
                subset_name][source_name].shape[0] == n_samples
            split_dict[subset_name][source_name] = (
                previous_end_idx, previous_end_idx + n_samples)
            if source_name not in source_to_list_of_subsets:
                source_to_list_of_subsets[source_name] = [
                    nested_dataset_dict[subset_name][source_name]]
            else:
                source_to_list_of_subsets[source_name].append(
                    nested_dataset_dict[subset_name][source_name])
        previous_end_idx += n_samples
    concatenated_subsets = {}
    for source_name in source_to_list_of_subsets.keys():
        concatenated_subsets[source_name] = np.concatenate(
            source_to_list_of_subsets[source_name], axis=0)

    def write_one_source(source_name, source_data, source_dim_labels,
                         h5py_file):
        """
        Writes the content for one source to the passed h5py File.

        Parameters
        ----------
        source_name : str
        source_data : ndarray(shape=S)
        source_dim_labels : [str]
            len(source_dim_labels) = len(S)
        h5py_file : h5py.File
        """
        source_handle = h5py_file.create_dataset(
            source_name, source_data.shape, dtype=dtype)
        source_handle[...] = source_data
        for dim, label in zip(source_handle.dims, source_dim_labels):
            dim.label = label

    with h5py.File(file_path, mode='w') as f:
        for source_name in concatenated_subsets.keys():
            write_one_source(
                source_name=source_name,
                source_data=concatenated_subsets[source_name],
                source_dim_labels=sources_dim_labels[source_name],
                h5py_file=f)
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        # No explicit close() needed: the with-statement closes the file.
        f.flush()
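# Usage sketch for write_h5py_dataset (shapes and output path are
# illustrative): every subset provides the same sources, and each source
# gets one axis label per dimension.
import numpy as np

write_h5py_dataset(
    nested_dataset_dict={
        'train': {'features': np.zeros((100, 10)),
                  'targets': np.zeros((100, 1))},
        'test': {'features': np.zeros((20, 10)),
                 'targets': np.zeros((20, 1))},
    },
    sources_dim_labels={'features': ['batch', 'feature'],
                        'targets': ['batch', 'index']},
    file_path='example.hdf5',
)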
logger.warning("missing %s" % identifier) target = int(classmap[os.path.dirname(identifier)]) lists[which_set].setdefault("targets", []).append(target) for i, source in enumerate(sources): compressed = np.fromstring( # level 1 compression is fast and takes out ~90% zlib.compress(cPickle.dumps(data[i]), 1), dtype=np.uint8) #assert np.array_equal(cPickle.loads(zlib.decompress(compressed)), data[i]) print source, data[i].shape, len(compressed) lists[which_set].setdefault(source, []).append(compressed) k += 1 print "%i/%i" % (k, n) split_dict = OrderedDict() b = 0 for which_set, set_by_source in lists.items(): a = b b += len(set_by_source["targets"]) h5file["targets"][a:b] = set_by_source["targets"] for source in sources: h5file[source][a:b] = set_by_source[source] split_dict[which_set] = OrderedDict([ (source, (a, b)) for source in ["targets"] + sources ]) h5file.attrs["split"] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close()
def main():
    net = vaegan.VAEGAN()
    network_saver = saver.NetworkSaver('vaegan/models/', net=net)
    network_saver.load()

    img_data = np.load('img_data.npy', mmap_mode='r')
    data_out = np.load('data_out.npy', mmap_mode='r')
    data_in = np.load('data_in.npy', mmap_mode='r')
    data_in = np.array(data_in)
    data_out = np.array(data_out)
    train_data_size = data_in.shape[0] * train_size
    print type(img_data), img_data.shape
    print type(data_in), data_in.shape
    print type(data_out), data_out.shape

    data_in, in_size = getConvFeatures(net, data_in, img_data)
    out_size = len(output_columns)
    max_prediction = max(future_predictions) + 1
    if len(data_in) % seq_length > 0:
        data_in = data_in[:len(data_in) - len(data_in) % seq_length
                          + max_prediction]
    else:
        data_in = data_in[:len(data_in) - seq_length + max_prediction]
    nsamples = (len(data_in) / seq_redundancy)

    print 'Saving data to disc...'
    inputs = np.memmap('inputs.npy', dtype=theano.config.floatX, mode='w+',
                       shape=(nsamples, seq_length, in_size))
    outputs = np.memmap('outputs.npy', dtype=theano.config.floatX, mode='w+',
                        shape=(nsamples, seq_length,
                               len(future_predictions) * out_size))
    for i, p in enumerate(xrange(0, len(data_in) - max_prediction - seq_length,
                                 seq_redundancy)):
        inputs[i] = np.array([d for d in data_in[p:p + seq_length]])
        for j in xrange(len(future_predictions)):
            outputs[i, :, j * out_size:(j + 1) * out_size] = np.array(
                [d for d in data_out[p + future_predictions[j]:
                                     p + seq_length + future_predictions[j]]])
    nsamples = len(inputs)
    nsamples_train = train_data_size // seq_length
    print np.isnan(np.sum(inputs))
    print np.isnan(np.sum(outputs))

    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('features', inputs.shape,
                                dtype=theano.config.floatX)
    targets = f.create_dataset('targets', outputs.shape,
                               dtype=theano.config.floatX)
    features[...] = inputs
    targets[...] = outputs
    features.dims[0].label = 'batch'
    features.dims[1].label = 'sequence'
    features.dims[2].label = 'features'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'sequence'
    targets.dims[2].label = 'outputs'
    split_dict = {
        'train': {'features': (0, nsamples_train),
                  'targets': (0, nsamples_train)},
        'test': {'features': (nsamples_train, nsamples),
                 'targets': (nsamples_train, nsamples)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    print nsamples_train, nsamples, train_data_size, seq_length
    print 'inputs shape:', inputs.shape
    print 'outputs shape:', outputs.shape
    print 'image inputs shape:', img_data.shape
    os.remove(inputs.filename)
    os.remove(outputs.filename)
    print 'Files saved on disc for Blocks!'
def convert_svhn_format_1(directory, output_directory,
                          output_filename='svhn_format_1.hdf5'):
    """Converts the SVHN dataset (format 1) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}.tar.gz`, which are accessible through the official
    website [SVHNSITE].

    .. [SVHNSITE] http://ufldl.stanford.edu/housenumbers/

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_1.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Create the temporary directory before entering the try block so the
    # cleanup code in the finally clause can always refer to TMPDIR.
    TMPDIR = tempfile.mkdtemp()
    try:
        output_path = os.path.join(output_directory, output_filename)
        h5file = h5py.File(output_path, mode='w')

        # Every image has three channels (RGB) and variable height and
        # width. It features a variable number of bounding boxes that
        # identify the location and label of digits. The bounding box
        # location is specified using the x and y coordinates of its top
        # left corner along with its width and height.
        BoundingBoxes = namedtuple(
            'BoundingBoxes', ['labels', 'heights', 'widths', 'lefts', 'tops'])
        sources = ('features',) + tuple(
            'bbox_{}'.format(field) for field in BoundingBoxes._fields)
        source_dtypes = dict(
            [(source, 'uint8') for source in sources[:2]] +
            [(source, 'uint16') for source in sources[2:]])
        source_axis_labels = {
            'features': ('channel', 'height', 'width'),
            'bbox_labels': ('bounding_box', 'index'),
            'bbox_heights': ('bounding_box', 'height'),
            'bbox_widths': ('bounding_box', 'width'),
            'bbox_lefts': ('bounding_box', 'x'),
            'bbox_tops': ('bounding_box', 'y')}

        # The dataset is split into three sets: the training set, the test
        # set and an extra set of examples that are somewhat less difficult
        # but can be used as extra training data. These sets are stored
        # separately as 'train.tar.gz', 'test.tar.gz' and 'extra.tar.gz'.
        # Each file contains a directory named after the split it stores.
        # The examples are stored in that directory as PNG images. The
        # directory also contains a 'digitStruct.mat' file with all the
        # bounding box and label information.
        splits = ('train', 'test', 'extra')
        file_paths = dict(zip(splits, FORMAT_1_FILES))
        for split, path in file_paths.items():
            file_paths[split] = os.path.join(directory, path)
        digit_struct_paths = dict(
            [(split, os.path.join(TMPDIR, split, 'digitStruct.mat'))
             for split in splits])

        # We first extract the data files in a temporary directory. While
        # doing that, we also count the number of examples for each split.
        # Files are extracted individually, which allows to display a
        # progress bar. Since the splits will be concatenated in the HDF5
        # file, we also compute the start and stop intervals of each split
        # within the concatenated array.
        def extract_tar(split):
            with tarfile.open(file_paths[split], 'r:gz') as f:
                members = f.getmembers()
                num_examples = sum(1 for m in members if '.png' in m.name)
                progress_bar_context = progress_bar(
                    name='{} file'.format(split), maxval=len(members),
                    prefix='Extracting')
                with progress_bar_context as bar:
                    for i, member in enumerate(members):
                        f.extract(member, path=TMPDIR)
                        bar.update(i)
            return num_examples

        examples_per_split = OrderedDict(
            [(split, extract_tar(split)) for split in splits])
        cumulative_num_examples = numpy.cumsum(
            [0] + list(examples_per_split.values()))
        num_examples = cumulative_num_examples[-1]
        intervals = zip(cumulative_num_examples[:-1],
                        cumulative_num_examples[1:])
        split_intervals = dict(zip(splits, intervals))

        # The start and stop indices are used to create a split dict that
        # will be parsed into the split array required by the H5PYDataset
        # interface. The split dict is organized as follows:
        #
        #     dict(split -> dict(source -> (start, stop)))
        #
        split_dict = OrderedDict([
            (split, OrderedDict([(s, split_intervals[split])
                                 for s in sources]))
            for split in splits])
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        # We then prepare the HDF5 dataset. This involves creating datasets
        # to store data sources and datasets to store auxiliary information
        # (namely the shapes for variable-length axes, and labels to
        # indicate what these variable-length axes represent).
        def make_vlen_dataset(source):
            # Create a variable-length 1D dataset
            dtype = h5py.special_dtype(
                vlen=numpy.dtype(source_dtypes[source]))
            dataset = h5file.create_dataset(
                source, (num_examples,), dtype=dtype)
            # Create a dataset to store variable-length shapes.
            axis_labels = source_axis_labels[source]
            dataset_shapes = h5file.create_dataset(
                '{}_shapes'.format(source), (num_examples, len(axis_labels)),
                dtype='uint16')
            # Create a dataset to store labels for variable-length axes.
            dataset_vlen_axis_labels = h5file.create_dataset(
                '{}_vlen_axis_labels'.format(source), (len(axis_labels),),
                dtype='S{}'.format(
                    numpy.max([len(label) for label in axis_labels])))
            # Fill variable-length axis labels
            dataset_vlen_axis_labels[...] = [
                label.encode('utf8') for label in axis_labels]
            # Attach auxiliary datasets as dimension scales of the
            # variable-length 1D dataset. This is in accordance with the
            # H5PYDataset interface.
            dataset.dims.create_scale(dataset_shapes, 'shapes')
            dataset.dims[0].attach_scale(dataset_shapes)
            dataset.dims.create_scale(dataset_vlen_axis_labels,
                                      'shape_labels')
            dataset.dims[0].attach_scale(dataset_vlen_axis_labels)
            # Tag fixed-length axis with its label
            dataset.dims[0].label = 'batch'

        for source in sources:
            make_vlen_dataset(source)

        # The "fun" part begins: we extract the bounding box and label
        # information contained in 'digitStruct.mat'. This is a version 7.3
        # Matlab file, which uses HDF5 under the hood, albeit with a very
        # convoluted layout.
        def get_boxes(split):
            boxes = []
            with h5py.File(digit_struct_paths[split], 'r') as f:
                bar_name = '{} digitStruct'.format(split)
                bar_maxval = examples_per_split[split]
                with progress_bar(bar_name, bar_maxval) as bar:
                    for image_number in range(examples_per_split[split]):
                        # The 'digitStruct' group is the main group of the
                        # HDF5 file. It contains two datasets: 'bbox' and
                        # 'name'. The 'name' dataset isn't of interest to
                        # us, as it stores file names and there's already a
                        # one-to-one mapping between row numbers and image
                        # names (e.g. row 0 corresponds to '1.png', row 1
                        # corresponds to '2.png', and so on).
                        main_group = f['digitStruct']
                        # The 'bbox' dataset contains the bounding box and
                        # label information we're after. It has as many rows
                        # as there are images, and one column. Elements of
                        # the 'bbox' dataset are object references that
                        # point to (yet another) group that contains the
                        # information for the corresponding image.
                        image_reference = main_group['bbox'][image_number, 0]

                        # There are five datasets contained in that group:
                        # 'label', 'height', 'width', 'left' and 'top'. Each
                        # of those datasets has as many rows as there are
                        # bounding boxes in the corresponding image, and one
                        # column.
                        def get_dataset(name):
                            return main_group[image_reference][name][:, 0]
                        names = ('label', 'height', 'width', 'left', 'top')
                        datasets = dict(
                            [(name, get_dataset(name)) for name in names])

                        # If there is only one bounding box, the information
                        # is stored directly in the datasets. If there are
                        # multiple bounding boxes, elements of those
                        # datasets are object references pointing to 1x1
                        # datasets that store the information (fortunately,
                        # it's the last hop we need to make).
                        def get_elements(dataset):
                            if len(dataset) > 1:
                                return [int(main_group[reference][0, 0])
                                        for reference in dataset]
                            else:
                                return [int(dataset[0])]
                        # Names are pluralized in the BoundingBox named
                        # tuple.
                        kwargs = dict(
                            [(name + 's', get_elements(dataset))
                             for name, dataset in iteritems(datasets)])
                        boxes.append(BoundingBoxes(**kwargs))
                        if bar:
                            bar.update(image_number)
            return boxes

        split_boxes = dict([(split, get_boxes(split)) for split in splits])

        # The final step is to fill the HDF5 file.
        def fill_split(split, bar=None):
            for image_number in range(examples_per_split[split]):
                image_path = os.path.join(
                    TMPDIR, split, '{}.png'.format(image_number + 1))
                image = numpy.asarray(
                    Image.open(image_path)).transpose(2, 0, 1)
                bounding_boxes = split_boxes[split][image_number]
                num_boxes = len(bounding_boxes.labels)
                index = image_number + split_intervals[split][0]

                h5file['features'][index] = image.flatten()
                h5file['features'].dims[0]['shapes'][index] = image.shape
                for field in BoundingBoxes._fields:
                    name = 'bbox_{}'.format(field)
                    h5file[name][index] = getattr(bounding_boxes, field)
                    h5file[name].dims[0]['shapes'][index] = [num_boxes, 1]

                # Replace label '10' with '0'.
                labels = h5file['bbox_labels'][index]
                labels[labels == 10] = 0
                h5file['bbox_labels'][index] = labels

                if image_number % 1000 == 0:
                    h5file.flush()
                if bar:
                    bar.update(index)

        with progress_bar('SVHN format 1', num_examples) as bar:
            for split in splits:
                fill_split(split, bar=bar)
    finally:
        if os.path.isdir(TMPDIR):
            shutil.rmtree(TMPDIR)
    h5file.flush()
    h5file.close()

    return (output_path,)
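# A short consumption sketch for the file converted above (the default
# output_filename from convert_svhn_format_1 is assumed). For the
# variable-length 'features' source, the 'shapes' dimension scale attached
# above lets H5PYDataset return each example with its own
# (channel, height, width) shape:
def inspect_svhn_train(path='svhn_format_1.hdf5'):
    from fuel.datasets.hdf5 import H5PYDataset
    train_set = H5PYDataset(path, which_sets=('train',),
                            sources=('features', 'bbox_labels'))
    handle = train_set.open()
    features, bbox_labels = train_set.get_data(handle, slice(0, 1))
    train_set.close(handle)
    return features[0].shape, bbox_labels[0]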
data_to_use = numpy.asarray(data_to_use)
input_array = data_to_use[:-frame_length].reshape(
    num_examples, seq_length, frame_length)
# Targets are the same signal shifted one frame into the future.
target_array = data_to_use[frame_length:].reshape(
    num_examples, seq_length, frame_length)
print input_array.shape
print target_array.shape

# Make H5PY file
print "\nMaking Fuel-formatted HDF5 file..."
f = h5py.File(hdf5_file, mode="w")
inputs = f.create_dataset("inputs", input_array.shape, dtype="float64")
targets = f.create_dataset("targets", target_array.shape, dtype="float64")
inputs[...] = input_array
targets[...] = target_array
inputs.dims[0].label = "batch"
inputs.dims[1].label = "sequence"
targets.dims[0].label = "batch"
targets.dims[1].label = "sequence"

# Split into train and dev sets
print "doing train:dev split (at " + str(train_samples) + ")"
num_train_examples = train_samples // example_length
split_dict = {
    "train": {"inputs": (0, num_train_examples),
              "targets": (0, num_train_examples)},
    "dev": {"inputs": (num_train_examples, num_examples),
            "targets": (num_train_examples, num_examples)},
}
f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
f.flush()
f.close()
print "file should be made"
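# A sketch of streaming minibatches from the file written above (assumes
# Fuel is installed; the batch size is an illustrative choice):
def stream_train_batches(path, batch_size=32):
    from fuel.datasets.hdf5 import H5PYDataset
    from fuel.schemes import SequentialScheme
    from fuel.streams import DataStream
    train_set = H5PYDataset(path, which_sets=("train",),
                            sources=("inputs", "targets"))
    scheme = SequentialScheme(train_set.num_examples, batch_size)
    stream = DataStream(train_set, iteration_scheme=scheme)
    for inputs_batch, targets_batch in stream.get_epoch_iterator():
        # Each batch is a (batch, sequence, frame) float64 array.
        pass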