Example 1
import tables as tb
from scipy.sparse import rand

def genAmatrixH(nk, cnk, p_in, p_out, K, filename):
    # nk: per-block node counts; cnk: cumulative block offsets (assumed)
    n = nk[K - 1] + cnk[K - 1]
    f = tb.open_file(filename, 'w')
    filters = tb.Filters(complevel=5, complib='blosc')
    out_indices = f.create_earray(f.root,
                                  'indices',
                                  tb.Int32Atom(),
                                  shape=(0, ),
                                  filters=filters)
    out_indptr = f.create_carray(f.root,
                                 'indptr',
                                 tb.Int32Atom(),
                                 shape=(n + 1, ),
                                 filters=filters)
    out_indptr[0] = 0

    for k in range(0, K):
        for i in range(0, nk[k]):
            con = rand(1, nk[k] - i - 1, density=p_in, format='csr')
            con.indices[:] = con.indices[:] + cnk[k] + i + 1
            out_indices.append(con.indices)
            out_indptr[i + cnk[k] + 1] = out_indptr[i + cnk[k]] + con.getnnz()
            for j in range(k + 1, K):
                con = rand(1, nk[j], density=p_out, format='csr')
                con.indices[:] = con.indices[:] + cnk[j]
                out_indices.append(con.indices)
                out_indptr[i + cnk[k] + 1] += con.getnnz()
    f.close()
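The function above stores only the CSR structure (indices and indptr): no values and no shape. A minimal read-back sketch, assuming every stored entry is a connection of value 1 and that the caller knows n (loadAmatrixH is a hypothetical name, not part of the original):

import numpy as np
import scipy.sparse as ss
import tables as tb

def loadAmatrixH(filename, n):
    # rebuild the adjacency pattern written by genAmatrixH above
    with tb.open_file(filename, 'r') as f:
        indices = f.root.indices[:]
        indptr = f.root.indptr[:]
    data = np.ones(len(indices), dtype=np.int8)  # structure only: all ones
    return ss.csr_matrix((data, indices, indptr), shape=(n, n))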
Example 2
import tables

def create_hdf5_file(fn, X_shape, y_shape, num_labels):
    h5file = tables.open_file(fn, mode="w", title="Dataset")
    filters = tables.Filters(complib='blosc', complevel=5)
    gcolumns = h5file.create_group(h5file.root, "Data", "Data")
    h5file.create_carray(gcolumns,
                         'X',
                         atom=tables.Int32Atom(),
                         shape=X_shape,
                         title="Data_X",
                         filters=filters)
    h5file.create_carray(gcolumns,
                         'y',
                         atom=tables.Int32Atom(),
                         shape=y_shape,
                         title="Data_y",
                         filters=filters)
    h5file.create_carray(gcolumns,
                         'num_labels',
                         atom=tables.Int32Atom(),
                         shape=(1, ),
                         title="num_labels",
                         filters=filters)
    node = h5file.get_node('/', 'Data')
    node.num_labels[0] = num_labels
    h5file.flush()
    return h5file
Example 3
import numpy as np
import scipy.sparse as ss
import tables as tb

def savemat(X, filepath):
    X = ss.csc_matrix(X)
    with tb.open_file(filepath, 'w') as f:
        filters = tb.Filters(complevel=5, complib='blosc')
        out_data = f.create_earray(f.root,
                                   'data',
                                   tb.Float32Atom(),
                                   shape=(0, ),
                                   filters=filters)
        out_indices = f.create_earray(f.root,
                                      'indices',
                                      tb.Int32Atom(),
                                      shape=(0, ),
                                      filters=filters)
        out_indptr = f.create_earray(f.root,
                                     'indptr',
                                     tb.Int32Atom(),
                                     shape=(0, ),
                                     filters=filters)
        out_shape = f.create_earray(f.root,
                                    'shape',
                                    tb.Int32Atom(),
                                    shape=(0, ),
                                    filters=filters)

        out_data.append(X.data)
        out_indices.append(X.indices)
        out_indptr.append(X.indptr)
        out_shape.append(np.array([X.shape[0], X.shape[1]]))
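A matching loader for the layout written above (loadmat is a hypothetical counterpart, not part of the original snippet):

import scipy.sparse as ss
import tables as tb

def loadmat(filepath):
    # read the four arrays back and reassemble the CSC matrix
    with tb.open_file(filepath, 'r') as f:
        return ss.csc_matrix(
            (f.root.data[:], f.root.indices[:], f.root.indptr[:]),
            shape=tuple(f.root.shape[:]))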
Example 4
import numpy as np
import tables as tb

def write_h5f_csr(h5f, h5fplace, name, atom, csr_mat):
    write_h5f_array(h5f, h5fplace, name + '_data', atom, csr_mat.data)
    write_h5f_array(h5f, h5fplace, name + '_indices', tb.Int32Atom(),
                    csr_mat.indices)
    write_h5f_array(h5f, h5fplace, name + '_indptr', tb.Int32Atom(),
                    csr_mat.indptr)
    write_h5f_array(h5f, h5fplace, name + '_shape', tb.Int32Atom(),
                    np.array(csr_mat.shape))
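write_h5f_array itself is not part of the snippet. A plausible minimal implementation, assuming it stores a 1-D array as a compressed CArray (the Blosc filter settings here are a guess):

import numpy as np
import tables as tb

def write_h5f_array(h5f, h5fplace, name, atom, data):
    # hypothetical helper matching the calls above: write `data` as a
    # CArray named `name` under the group `h5fplace`
    data = np.asarray(data)
    arr = h5f.create_carray(h5fplace, name, atom, shape=data.shape,
                            filters=tb.Filters(complevel=5, complib='blosc'))
    arr[:] = data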
Example 5
def hist_writer(file,
                *,
                group_name: 'options: HIST, HIST2D',
                table_name: 'options: pmt, pmtMAU, sipm, sipmMAU',
                compression='ZLIB4',
                n_sensors: 'number of pmts or sipms',
                bin_centres: 'np.array of bin centres'):
    try:
        hist_group = getattr(file.root, group_name)
    except tb.NoSuchNodeError:
        hist_group = file.create_group(file.root, group_name)

    n_bins = len(bin_centres)

    hist_table = file.create_earray(hist_group,
                                    table_name,
                                    atom=tb.Int32Atom(),
                                    shape=(0, n_sensors, n_bins),
                                    filters=tbl.filters(compression))

    ## The bins can be written just once at definition of the writer
    file.create_array(hist_group, table_name + '_bins', bin_centres)

    def write_hist(histo: 'np.array of histograms, one for each sensor'):
        hist_table.append(histo.reshape(1, n_sensors, n_bins))

    return write_hist
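A hypothetical usage sketch. It assumes the surrounding module's tbl.filters helper (which maps a string like 'ZLIB4' to a tables.Filters instance) is importable; the file name and the sensor/bin counts are made up:

import numpy as np
import tables as tb

with tb.open_file('hists.h5', 'w') as f:
    write = hist_writer(f,
                        group_name='HIST',
                        table_name='pmt',
                        n_sensors=12,
                        bin_centres=np.arange(50))
    # one int32 histogram per sensor, appended as a single (1, 12, 50) slab
    write(np.zeros((12, 50), dtype=np.int32))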
Example 6
    def __init__(self,
                 shape=None,
                 name="skin_out",
                 format="h5",
                 folder="./../data/"):
        self.name = name
        self.format = format
        self.folder = folder
        self.shape = shape

        # name_base = ''.join([char for char in self.name if not char.isdigit()])
        # num = ''.join([char for char in self.name if char.isdigit()])
        # if len(num) == 0:
        #     num = 0
        # else:
        #     num = int(num)
        # self.filename = "{}{}.{}".format(self.folder, name_base + str(num), self.format)

        self.filename = "{}{}.{}".format(self.folder, self.name, self.format)

        num = 0
        # file_exists: presumably os.path.exists from the surrounding module
        while file_exists(self.filename):
            self.filename = "{}{}-({}).{}".format(self.folder, self.name, num,
                                                  self.format)
            num += 1
        self.file = tables.open_file(self.filename, mode='w')
        self.file.create_earray(self.file.root, 'data', tables.Int32Atom(),
                                self.shape)
Example 7
def save_codenn_series(series: Iterable[str],
                       word2code: Dict[str, int],
                       file_path: str,
                       separator: str = '|') -> None:
    """Save series into file using CODEnn hdf5 format.

    Args:
        series (iterable of str): series of `sep` separated string.
        word2code (dict of str to int): word-to-code mapper.
        file_path (str): path to the output file.
        separator (str): separator to separate string into words.
    """
    with tables.open_file(file_path, mode='w') as h5f:
        table = h5f.create_table('/', 'indices', {
            'length': tables.UInt32Col(),
            'pos': tables.UInt32Col()
        }, 'a table of indices and lengths')
        array = h5f.create_earray('/', 'phrases', tables.Int32Atom(), (0, ))
        array.flavor = 'numpy'
        pos = 0
        for item in series:
            item = item.split(separator)
            length = len(item)
            index = table.row
            index['length'] = length
            index['pos'] = pos
            index.append()
            array.append(convert_words_to_codes(item, word2code))
            pos += length
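The inverse operation as a sketch (load_codenn_series is a hypothetical name): slice the flat phrases array using the (pos, length) rows of the indices table.

import tables

def load_codenn_series(file_path):
    # rebuild each code sequence from the flat 'phrases' array
    with tables.open_file(file_path, 'r') as h5f:
        phrases = h5f.root.phrases[:]
        return [phrases[row['pos']:row['pos'] + row['length']].tolist()
                for row in h5f.root.indices]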
Example 8
    def create_VLIntArray(self, name, array, group):
        """Stores a homogeneous variable-length integer array in a group"""
        self.h5file.create_vlarray(group,
                                   name,
                                   tables.Int32Atom(),
                                   "ragged array of ints",
                                   chunkshape=512)
Example 9
def open_h5_files() -> tuple:

    float_atom = tables.Float32Atom()
    int_atom = tables.Int32Atom()

    fd_m = tables.open_file(os.path.join(MATRIX_DATASET_FOLDER, "all.h5"),
                            mode="w")
    data_m = fd_m.create_earray(fd_m.root,
                                "data",
                                float_atom,
                                (0, MATRIX_DIMENSION, MATRIX_DIMENSION),
                                expectedrows=600000)
    label_m = fd_m.create_earray(fd_m.root,
                                 "labels",
                                 int_atom, (0, 1),
                                 expectedrows=600000)

    fd_t = tables.open_file(os.path.join(TENSOR_DATASET_FOLDER, "all.h5"),
                            mode="w")

    data_t = fd_t.create_earray(
        fd_t.root,
        "data",
        float_atom, (0, TENSOR_DIMENSION, TENSOR_DIMENSION, TENSOR_DIMENSION),
        expectedrows=600000)
    label_t = fd_t.create_earray(fd_t.root,
                                 "labels",
                                 int_atom, (0, 1),
                                 expectedrows=600000)

    fd_m_test = tables.open_file(os.path.join(MATRIX_DATASET_FOLDER,
                                              "all_test.h5"),
                                 mode="w")
    data_m_test = fd_m_test.create_earray(fd_m_test.root,
                                          "data",
                                          float_atom,
                                          (0, MATRIX_DIMENSION, MATRIX_DIMENSION),
                                          expectedrows=60000)
    label_m_test = fd_m_test.create_earray(fd_m_test.root,
                                           "labels",
                                           int_atom, (0, 1),
                                           expectedrows=60000)

    fd_t_test = tables.open_file(os.path.join(TENSOR_DATASET_FOLDER,
                                              "all_test.h5"),
                                 mode="w")
    data_t_test = fd_t_test.create_earray(
        fd_t_test.root,
        "data",
        float_atom, (0, TENSOR_DIMENSION, TENSOR_DIMENSION, TENSOR_DIMENSION),
        expectedrows=60000)
    label_t_test = fd_t_test.create_earray(fd_t_test.root,
                                           "labels",
                                           int_atom, (0, 1),
                                           expectedrows=60000)

    return (fd_m, data_m, label_m), \
           (fd_t, data_t, label_t), \
           (fd_m_test, data_m_test, label_m_test), \
           (fd_t_test, data_t_test, label_t_test)
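A hypothetical usage sketch for the writer handles returned above (the label value is made up; MATRIX_DIMENSION comes from the surrounding module):

import numpy as np

(fd_m, data_m, label_m), (fd_t, data_t, label_t), m_test, t_test = open_h5_files()

# EArrays grow along their first (zero-length) axis, so append whole slabs
data_m.append(np.zeros((1, MATRIX_DIMENSION, MATRIX_DIMENSION), dtype=np.float32))
label_m.append(np.array([[3]], dtype=np.int32))

for fd in (fd_m, fd_t, m_test[0], t_test[0]):
    fd.close()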
Example 10
def _make_int_vlarray(h5file: tables.File, name: str,
                      attribute: np.ndarray) -> None:
    vlarray = h5file.create_vlarray(h5file.root,
                                    name=name,
                                    atom=tables.Int32Atom(shape=()))
    for a in attribute:
        vlarray.append(a)
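A sketch of the counterpart reader (hypothetical name); each row comes back as a 1-D int32 numpy array:

import tables

def _read_int_vlarray(h5file: tables.File, name: str) -> list:
    return list(h5file.get_node(h5file.root, name))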
Example 11
import numpy as np
import tables as tb

def hdf5_save(matrix, filename, dtype=np.dtype(np.float64)):
    '''
    Helper function for storing scipy matrices as PyTables HDF5 matrices
    see http://www.philippsinger.info/?p=464 for further information
    :param matrix: matrix to store
    :param filename: filename for storage
    :param dtype: dtype
    :return: None
    '''

    atom = tb.Atom.from_dtype(dtype)

    f = tb.open_file(filename, 'w')

    #print "saving data"
    filters = tb.Filters(complevel=5, complib='blosc')
    out = f.create_carray(f.root,
                          'data',
                          atom,
                          shape=matrix.data.shape,
                          filters=filters)
    out[:] = matrix.data

    #print "saving indices"
    out = f.create_carray(f.root,
                          'indices',
                          tb.Int32Atom(),
                          shape=matrix.indices.shape,
                          filters=filters)
    out[:] = matrix.indices

    #print "saving indptr"
    out = f.create_carray(f.root,
                          'indptr',
                          tb.Int32Atom(),
                          shape=matrix.indptr.shape,
                          filters=filters)
    out[:] = matrix.indptr

    #print "saving done"

    f.close()

    return
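Note that hdf5_save writes data, indices and indptr but not the matrix shape, so a loader has to infer it. A hypothetical counterpart, assuming CSR layout (scipy infers the shape from indptr and the largest column index, so trailing all-zero columns are lost):

import scipy.sparse as sp
import tables as tb

def hdf5_load(filename):
    with tb.open_file(filename, 'r') as f:
        return sp.csr_matrix((f.root.data[:], f.root.indices[:],
                              f.root.indptr[:]))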
Example 12
def write_categorical(source: CategoricalArraySource,
                      hfile: tables.File,
                      n_workers: int,
                      batchrows: Optional[int] = None,
                      maps: Optional[np.ndarray] = None) -> None:
    transform = CategoryMapper(maps, source.missing) if maps else IdWorker()
    n_workers = n_workers if maps else 0
    _write_source(source, hfile, tables.Int32Atom(source.shape[-1]),
                  "categorical_data", transform, n_workers, batchrows)
Example 13
    def __openNextFile(self):
        self.fileName = self.__getNextFileName()
        self.file = tables.open_file(self.fileName, mode='w')
        self.arrays = {}

        self.arrays['I'] = self.file.create_vlarray(self.file.root,
                                                    'I',
                                                    tables.Int32Atom(shape=()),
                                                    'I',
                                                    filters=tables.Filters(1))
        self.arrays['Q'] = self.file.create_vlarray(self.file.root,
                                                    'Q',
                                                    tables.Int32Atom(shape=()),
                                                    'Q',
                                                    filters=tables.Filters(1))

        self.fileOpened = True
        self.nWrittenToFile = 0
Example 14
import numpy as np
import tables

def create_dataset(filename: str, coefficients: int):

    print("called create_dataset")
    print(filename)
    N = 256
    with tables.open_file(filename, "w") as hdf5file:

        # create array for the object
        hdf5file.create_earray(hdf5file.root,
                               "object_real",
                               tables.Float32Atom(),
                               shape=(0, N * N))

        # create array for the object phase
        hdf5file.create_earray(hdf5file.root,
                               "object_imag",
                               tables.Float32Atom(),
                               shape=(0, N * N))

        # create array for the image
        hdf5file.create_earray(hdf5file.root,
                               "diffraction_noise",
                               tables.Float32Atom(),
                               shape=(0, N * N))

        # create array for the image
        hdf5file.create_earray(hdf5file.root,
                               "diffraction_noisefree",
                               tables.Float32Atom(),
                               shape=(0, N * N))

        # scale
        hdf5file.create_earray(hdf5file.root,
                               "scale",
                               tables.Float32Atom(),
                               shape=(0, 1))

        # zernike coefficients
        hdf5file.create_earray(hdf5file.root,
                               "coefficients",
                               tables.Float32Atom(),
                               shape=(0, coefficients))

        hdf5file.create_earray(hdf5file.root,
                               "N",
                               tables.Int32Atom(),
                               shape=(0, 1))

    # no explicit close needed here: leaving the with-block closes the file
    with tables.open_file(filename, mode='a') as hd5file:
        # save the dimensions of the data
        hd5file.root.N.append(np.array([[N]]))
Example 15
import numpy as np
import tables

def assign_array(db, name, a, verbose=1):
    if a.dtype == np.dtype('int32'):
        atom = tables.Int32Atom()
    elif a.dtype == np.dtype('int64'):
        atom = tables.Int64Atom()
    elif a.dtype == np.dtype('f') or a.dtype == np.dtype('d'):
        # note: double-precision input is downcast to float32 on write
        atom = tables.Float32Atom()
    else:
        raise Exception('unknown array type: %s' % a.dtype)
    if verbose:
        print("[writing", name, a.shape, atom, "]")
    node = db.create_earray(db.root, name, atom,
                            shape=[0] + list(a.shape[1:]),
                            filters=tables.Filters(9))
    node.append(a)
Example 16
    def _create_table_list(self, name, example):
        """
        Create a new table within the HDF file, where the tables shape and its
        datatype are determined by *example*.
        The modified version for creating table with appendList
        """
        type_map = {
            np.dtype(np.float64): tables.Float64Atom(),
            np.dtype(np.float32): tables.Float32Atom(),
            np.dtype(np.int64): tables.Int64Atom(),
            np.dtype(np.int8): tables.Int8Atom(),
            np.dtype(np.uint8): tables.UInt8Atom(),
            np.dtype(np.int16): tables.Int16Atom(),
            np.dtype(np.uint16): tables.UInt16Atom(),
            np.dtype(np.int32): tables.Int32Atom(),
            np.dtype(np.uint32): tables.UInt32Atom(),
            np.dtype(bool): tables.BoolAtom(),
        }

        try:
            if type(example) == np.ndarray:
                h5type = type_map[example.dtype]
            elif type(example) == list and type(example[0]) == str:
                h5type = tables.VLStringAtom()
        except KeyError:
            raise TypeError("Don't know how to handle dtype '%s'" %
                            example.dtype)

        if type(example) == np.ndarray:
            h5dim = (0, ) + example.shape[1:]

            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)
            self.tables[name] = h5.create_earray(h5.root,
                                                 name,
                                                 h5type,
                                                 h5dim,
                                                 filters=filters)
        elif type(example) == list and type(example[0]) == str:
            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)
            self.tables[name] = h5.create_vlarray(h5.root,
                                                  name,
                                                  h5type,
                                                  filters=filters)
        self.types[name] = type(example)
Example 17
    def _create_table(self, name, example):
        """
        Create a new table within the HDF file, where the tables shape and its
        datatype are determined by *example*.
        """
        type_map = {
            np.dtype(np.float64): tables.Float64Atom(),
            np.dtype(np.float32): tables.Float32Atom(),
            np.dtype(np.int64): tables.Int64Atom(),
            np.dtype(np.int8): tables.Int8Atom(),
            np.dtype(np.uint8): tables.UInt8Atom(),
            np.dtype(np.int16): tables.Int16Atom(),
            np.dtype(np.uint16): tables.UInt16Atom(),
            np.dtype(np.int32): tables.Int32Atom(),
            np.dtype(np.uint32): tables.UInt32Atom(),
            np.dtype(bool): tables.BoolAtom(),
        }

        try:
            if type(example) == np.ndarray:
                h5type = type_map[example.dtype]
            elif type(example) == str:
                h5type = tables.VLStringAtom()
        except KeyError:
            raise TypeError(
                "Could not create table %s because of unknown dtype '%s'" %
                (name, example.dtype))  #+ ", of name: " % example.shape)

        if type(example) == np.ndarray:
            h5dim = (0, ) + example.shape

            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)
            self.tables[name] = h5.create_earray(h5.root,
                                                 name,
                                                 h5type,
                                                 h5dim,
                                                 filters=filters)
        elif type(example) == str:
            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)
            self.tables[name] = h5.create_vlarray(h5.root,
                                                  name,
                                                  h5type,
                                                  filters=filters)
        self.types[name] = type(example)
Example 18
def create_tables(from_gdb, tab_name):
    track = from_gdb.create_track(tab_name)

    chrom_list = from_gdb.get_all_chromosomes()

    atom = tables.Int32Atom(dflt=POS_UNDEF)
    
    for chrom in chrom_list:        
        sys.stderr.write(" %s\n" % chrom.name)
        shape = (chrom.length, 3)
        carray = track.h5f.create_carray(track.h5f.root, chrom.name, atom,
                                         shape, filters=ZLIB_FILTER)

        #carray[:,:] = POS_UNDEF

    return track
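The dflt=POS_UNDEF argument is what makes the commented-out initialization above unnecessary: CArray chunks that are never written read back as the atom's default value. A small standalone illustration (file name made up):

import tables

with tables.open_file('dflt_demo.h5', 'w') as f:
    atom = tables.Int32Atom(dflt=-1)
    ca = f.create_carray(f.root, 'demo', atom, (4,))
    print(ca[:])   # -> [-1 -1 -1 -1], without ever writing to the array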
Example 19
    def save(self, filepath):
        import numpy as np
        import tables
        fitNPBS = np.array(
            [self.NPoints, self.NBind, self.NFix, self.NSmooth])
        atom1 = tables.Float64Atom()
        atom2 = tables.Int32Atom()
        filters = tables.Filters(complevel=5, complib='zlib')
        h5f = tables.open_file(filepath, 'w')
        h5NWMap = h5f.create_carray(h5f.root, 'NWMap', atom2,
                                    self.NWMap.shape, filters=filters)
        h5NWMap[:, :] = self.NWMap
        h5Nodes = h5f.create_carray(h5f.root, 'Nodes', atom2,
                                    self.Nodes.shape, filters=filters)
        h5Nodes[:] = self.Nodes
        h5Weights = h5f.create_carray(h5f.root, 'Weights', atom1,
                                      self.Weights.shape, filters=filters)
        h5Weights[:] = self.Weights
        invA = h5f.create_carray(h5f.root, 'invA', atom1,
                                 self.svd_invA.shape, filters=filters)
        invA[:, :] = self.svd_invA
        b = h5f.create_carray(h5f.root, 'b', atom1, self.b.shape,
                              filters=filters)
        b[:] = self.b
        fixed = h5f.create_carray(h5f.root, 'fixed', atom1,
                                  self.fixed.shape, filters=filters)
        fixed[:] = self.fixed
        npbs = h5f.create_carray(h5f.root, 'npbs', atom2, fitNPBS.shape)
        npbs[:] = fitNPBS
        h5f.close()
Example 20
    def _create_table(self, name, example, parent=None):
        """
        Create a new table within the HDF file, where the tables shape and its
        datatype are determined by *example*.
        """
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib',
                                 shuffle=True)
        if parent is None:
            parent = h5.root

        if type(example) == str:
            h5type = tables.VLStringAtom()
            h5.create_vlarray(parent, name, h5type, filters=filters)
            return
        if type(example) == dict:
            self.h5.create_group(parent, name)
            return
            return
        #If we get here then we're dealing with numpy arrays
        example = np.asarray(example)

        #MODIFICATION: appended name everywhere and introduced string
        type_map = {
            np.dtype(np.float64).name: tables.Float64Atom(),
            np.dtype(np.float32).name: tables.Float32Atom(),
            np.dtype(np.int64).name: tables.Int64Atom(),
            np.dtype(np.int8).name: tables.Int8Atom(),
            np.dtype(np.uint8).name: tables.UInt8Atom(),
            np.dtype(np.int16).name: tables.Int16Atom(),
            np.dtype(np.uint16).name: tables.UInt16Atom(),
            np.dtype(np.int32).name: tables.Int32Atom(),
            np.dtype(np.uint32).name: tables.UInt32Atom(),
            np.dtype(bool).name: tables.BoolAtom(),
            # Maximal string length of 128 per string - change if needed
            'string32': tables.StringAtom(128)
        }

        try:
            h5type = type_map[example.dtype.name]
            h5dim = (0, ) + example.shape
            h5.create_earray(parent, name, h5type, h5dim, filters=filters)
        except KeyError:
            raise TypeError("Don't know how to handle dtype '%s'" %
                            example.dtype)
Example 21
    def __init__(self,args):
        super().__init__(args)

        filename = args['mem_location']
        
        try:
            self._kill_any_open_file()
            load_into_mem = args['load_h5_into_mem'] if 'load_h5_into_mem' in args else False
            if load_into_mem:
                print('Loading h5 file into memory...')
                self.h5file = tb.open_file(filename, mode='a', driver="H5FD_CORE")
                print('Done!')
            else:
                self.h5file = tb.open_file(filename, mode='a')
            self.frame = self.h5file.get_node("/","frame")
            self.measurements = self.h5file.get_node("/","measurements")
            self.a_history = self.h5file.get_node("/","a_history")
            self.aidx = self.h5file.get_node("/","aidx")
            self.a_taken_prob = self.h5file.get_node("/","a_taken_prob")
            self.state_value = self.h5file.get_node("/","state_value")
            self.gae = self.h5file.get_node("/","gae")            

        except:
            print_exc()
            try:
                self.h5file.close()
            except:
                pass
            build_file = input('Unable to load H5 File. Would you like to build a new one? This will overwrite any existing file. (y/n): ')
            if build_file=='y':
                self.h5file = tb.open_file(filename, mode='w', title="Doom Replay Data")
                root = self.h5file.root 
                self.frame = self.h5file.create_vlarray(root,'frame',tb.Float32Atom())
                self.measurements = self.h5file.create_vlarray(root,'measurements',tb.Float32Atom())
                self.a_history = self.h5file.create_vlarray(root,'a_history',tb.Float32Atom())
                self.aidx = self.h5file.create_vlarray(root,'aidx',tb.Int32Atom())
                self.a_taken_prob = self.h5file.create_vlarray(root,'a_taken_prob',tb.Float32Atom())
                self.state_value = self.h5file.create_vlarray(root,'state_value',tb.Float32Atom())
                self.gae = self.h5file.create_vlarray(root,'gae',tb.Float32Atom())
                    
            else:
                raise ValueError("No H5 file loaded. Please load valid h5 file or create a new one")      
Example 22
def fetch_svhn_extra(source_paths, target_path):
    extra_path = source_paths[0]

    print('Converting {} to HDF5 (compressed)...'.format(extra_path))
    f_out = tables.open_file(target_path, mode='w')
    g_out = f_out.create_group(f_out.root, 'svhn', 'SVHN data')
    filters = tables.Filters(complevel=9, complib='blosc')
    X_u8_arr = f_out.create_earray(g_out,
                                   'extra_X_u8',
                                   tables.UInt8Atom(), (0, 3, 32, 32),
                                   filters=filters)
    y_arr = f_out.create_earray(g_out,
                                'extra_y',
                                tables.Int32Atom(), (0, ),
                                filters=filters)

    # Load in the extra data Matlab file
    _insert_svhn_matlab_to_h5(X_u8_arr, y_arr, extra_path)

    f_out.close()

    return target_path
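A read-back sketch for the file produced above (target_path as in the function):

import tables

with tables.open_file(target_path, 'r') as f:
    X_u8 = f.root.svhn.extra_X_u8[:]   # uint8, shape (N, 3, 32, 32)
    y = f.root.svhn.extra_y[:]         # int32, shape (N,)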
Example 23
def create_sent_tokens_array():
    try:
        tokens_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='w')
        atom = tables.StringAtom(itemsize=16)
        tokens_arr = tokens_file.create_earray(tokens_file.root, 'data', atom, (0, cnt.MAX_WORDS))
        vocab = set()
        
        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
        num_batches = int(math.ceil(float(n)/batch_size))

        for m in range(num_batches):
            start, end = m*batch_size, min((m+1)*batch_size, n)
            batch_items = [items[x] for x in range(start, end)]
            tokens = [gutils.padd_fn(gutils.get_tokens(gutils.get_item_text(item))) for item in batch_items]
            tokens_arr.append(tokens)
            vocab.update([x for token in tokens for x in token])
            
        vocab = sorted(list(vocab))
        word2idx_map = {w: i + 1 for i, w in enumerate(vocab)}
        gutils.save_data_pkl(word2idx_map, cnt.WORD2IDX_FILE)
        
        sent_tokens = tokens_file.root.data
        
        sents_arr_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_ARRAYS_FILE), mode='w')
        atom = tables.Int32Atom()
        sents_arr = sents_arr_file.create_earray(sents_arr_file.root, 'data', atom, (0, cnt.MAX_WORDS))
        
        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
        num_batches = int(math.ceil(float(n)/batch_size))
        
        for m in range(num_batches):
            start, end = m*batch_size, min((m+1)*batch_size, n)
            tokens = [sent_tokens[x] for x in range(start, end)]
            sent_arrs = [[gutils.word_to_idx(w, word2idx_map) for w in token] for token in tokens]
            sents_arr.append(sent_arrs)
        
    finally:
        tokens_file.close()
        # sents_arr_file is only bound once the first stage succeeds
        if 'sents_arr_file' in locals():
            sents_arr_file.close()
Example 24
def convert_corpus(corpusfile, outfile):
    """Loads the given [corpusfile] (which should be of a type loadable by Corpus) as a list of lists of sentence
    arrays (the output of Corpus.build_sentence_document_arrays) and saves it as an HDF5 file at [outfile].
    """
    c = Corpus(corpusfile)  #, ndocs=10000)
    print("Building document arrays..")
    docarrays, mvocab = c.build_sentence_document_arrays(vocab)

    print("Opening output file..")
    tf = tables.open_file(outfile, mode="w", title="converted_corpus")

    print("Saving document arrays..")
    darr = tf.create_vlarray(tf.root, "docarrays", tables.Int32Atom(shape=()))
    for da in docarrays:
        darr.append(da)

    print("Saving vocabulary..")
    varr = tf.create_vlarray(tf.root, "vocab",
                             tables.StringAtom(max(map(len, mvocab))))
    for w in mvocab:
        varr.append(w)

    print("Closing output file..")
    tf.close()
Example 25
root = r'/mnt/DATA/Prob_IR/'
context_dataset_name = r'context_data'
encoded_docs_filename = r'encoded_docs_model'
word_index_filename = r'word_index'
emb_filename = r'embeddings_dim_' + str(opt['dim']) + '_margin_' + str(
    opt['margin'])
emb_path = os.path.join(root, emb_filename)

context_dataset_path = os.path.join(root, context_dataset_name)
print("Loading data...")

_, words = load_dataset(root, encoded_docs_filename, word_index_filename)
idx_words = np.array(list(range(len(words))))

atom = tables.Int32Atom()

with tables.open_file(context_dataset_path, mode='r') as f:
    train_context = torch.Tensor(np.array(f.root.data[:]))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dim = 20
w = torch.zeros(len(words), dim + dim**2, device=device)
init.xavier_uniform_(w)
w.requires_grad = True

ntot = opt['num_positive'] + 1
opt['tot_batch_size'] = ntot * opt['batch_size']

dataset = TensorDataset(train_context)
loader = DataLoader(dataset=dataset,
Example 26
def preprocess(csv_files,
               batch_size,
               numcep,
               numcontext,
               alphabet,
               hdf5_cache_path=None):
    COLUMNS = ('features', 'features_len', 'transcript', 'transcript_len')

    print('Preprocessing', csv_files)

    if hdf5_cache_path and os.path.exists(hdf5_cache_path):
        with tables.open_file(hdf5_cache_path, 'r') as file:
            features = file.root.features[:]
            features_len = file.root.features_len[:]
            transcript = file.root.transcript[:]
            transcript_len = file.root.transcript_len[:]

            # features are stored flattened, so reshape into [n_steps, numcep]
            for i in range(len(features)):
                features[i].shape = [features_len[i] + 2 * numcontext, numcep]

            in_data = list(
                zip(features, features_len, transcript, transcript_len))
            print('Loaded from cache at', hdf5_cache_path)
            return pandas.DataFrame(data=in_data, columns=COLUMNS)

    source_data = None
    for csv in csv_files:
        file = pandas.read_csv(csv, encoding='utf-8', na_filter=False)
        #FIXME: not cross-platform
        csv_dir = os.path.dirname(os.path.abspath(csv))
        file['wav_filename'] = file['wav_filename'].str.replace(
            r'(^[^/])', lambda m: os.path.join(csv_dir, m.group(1)),
            regex=True)
        if source_data is None:
            source_data = file
        else:
            # DataFrame.append was removed in pandas 2.0; use concat
            source_data = pandas.concat([source_data, file])

    step_fn = partial(process_single_file,
                      numcep=numcep,
                      numcontext=numcontext,
                      alphabet=alphabet)
    out_data = pmap(step_fn, source_data.iterrows())

    if hdf5_cache_path:
        print('Saving to', hdf5_cache_path)

        # list of tuples -> tuple of lists
        features, features_len, transcript, transcript_len = zip(*out_data)

        with tables.open_file(hdf5_cache_path, 'w') as file:
            features_dset = file.create_vlarray(
                file.root,
                'features',
                tables.Float32Atom(),
                filters=tables.Filters(complevel=1))
            # VLArray atoms need to be 1D, so flatten feature array
            for f in features:
                features_dset.append(np.reshape(f, -1))

            features_len_dset = file.create_array(file.root, 'features_len',
                                                  features_len)

            transcript_dset = file.create_vlarray(
                file.root,
                'transcript',
                tables.Int32Atom(),
                filters=tables.Filters(complevel=1))
            for t in transcript:
                transcript_dset.append(t)

            transcript_len_dset = file.create_array(file.root,
                                                    'transcript_len',
                                                    transcript_len)

    print('Preprocessing done')
    return pandas.DataFrame(data=out_data, columns=COLUMNS)
Example 27
"""Small example that shows how to work with variable length arrays of
different types, UNICODE strings and general Python objects included."""

from __future__ import print_function
import numpy as np
import tables
import pickle

# Open a new empty HDF5 file
fileh = tables.open_file("vlarray2.h5", mode="w")
# Get the root group
root = fileh.root

# A test with VL length arrays:
vlarray = fileh.create_vlarray(root, 'vlarray1', tables.Int32Atom(),
                               "ragged array of ints")
vlarray.append(np.array([5, 6]))
vlarray.append(np.array([5, 6, 7]))
vlarray.append([5, 6, 9, 8])

# Test with lists of bidimensional vectors
vlarray = fileh.create_vlarray(root, 'vlarray2', tables.Int64Atom(shape=(2,)),
                               "Ragged array of vectors")
a = np.array([[1, 2], [1, 2]], dtype=np.int64)
vlarray.append(a)
vlarray.append(np.array([[1, 2], [3, 4]], dtype=np.int64))
vlarray.append(np.zeros(dtype=np.int64, shape=(0, 2)))
vlarray.append(np.array([[5, 6]], dtype=np.int64))
# This makes an error (shape)
# vlarray.append(array([[5], [6]], dtype=int64))
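# Reading back (a sketch; the original example continues past this
# point): each append above becomes one ragged int32 row.
for row in fileh.root.vlarray1:
    print(row)   # [5 6], then [5 6 7], then [5 6 9 8]
fileh.close()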
Example 28
# t = f.create_table(f.root, 'table', recarray, "mdim recarray")

# a0 = f.create_array(f.root, 'field0', recarray['f0'], "mdim int32 array")
# a1 = f.create_array(f.root, 'field1', recarray['f1'], "mdim float64 array")

# c0 = f.create_carray(f.root, 'cfield0',
#                     tables.Int32Atom(), (2,2,2),
#                     "mdim int32 carray")
# c1 = f.create_carray(f.root, 'cfield1',
#                     tables.Float64Atom(), (2,3,3),
#                     "mdim float64 carray")

f1 = tables.open_file("chunkshape1.h5", mode="w")
c1 = f1.create_carray(f1.root, 'cfield1',
                     tables.Int32Atom(), (L, N, M),
                     "scalar int32 carray", tables.Filters(complevel=0))

t1 = time()
c1[:] = numpy.empty(shape=(L, 1, 1), dtype="int32")
print("carray1 populate time:", time() - t1)
f1.close()


f2 = tables.open_file("chunkshape2.h5", mode="w")
c2 = f2.create_carray(f2.root, 'cfield2',
                      tables.Int32Atom(), (L, M, N),
                      "scalar int32 carray",
                      tables.Filters(complevel=0))  # complevel was undefined; 0 assumed, as for c1

t1 = time()
c2[:] = numpy.empty(shape=(L, 1, 1), dtype="int32")
Example 29
    def _create_table_list(self, name, example):
        """
        Create a new table within the HDF file, where the tables shape and its
        datatype are determined by *example*.
        The modified version for creating table with appendList
        """
        type_map = {
            np.dtype(np.float64): tables.Float64Atom(),
            np.dtype(np.float32): tables.Float32Atom(),
            np.dtype(np.int64): tables.Int64Atom(),
            np.dtype(np.int8): tables.Int8Atom(),
            np.dtype(np.uint8): tables.UInt8Atom(),
            np.dtype(np.int16): tables.Int16Atom(),
            np.dtype(np.uint16): tables.UInt16Atom(),
            np.dtype(np.int32): tables.Int32Atom(),
            np.dtype(np.uint32): tables.UInt32Atom(),
            np.dtype(bool): tables.BoolAtom(),
        }

        try:
            if type(example) == np.ndarray:
                h5type = type_map[example.dtype]
            elif type(example) == list and type(example[0]) == str:
                h5type = tables.VLStringAtom()
        except KeyError:
            raise TypeError("Don't know how to handle dtype '%s'" %
                            example.dtype)

        if type(example) == np.ndarray:
            h5dim = (0, ) + example.shape[1:]

            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)

            nodes = h5.list_nodes(h5.root)

            nmpt = name.replace('.', '/\n')
            nmpt = nmpt.split('\n')

            path = '/'
            for kay in range(len(nmpt) - 1):
                #if not path+nmpt[kay][:-1] in str(nodes): h5.create_group(path,nmpt[kay][:-1])
                try:
                    h5.is_visible_node(path + nmpt[kay][:-1])
                except:
                    h5.create_group(path, nmpt[kay][:-1])
                path += nmpt[kay]

            self.tables[name] = h5.create_earray(path,
                                                 nmpt[-1],
                                                 h5type,
                                                 h5dim,
                                                 filters=filters)

        elif type(example) == list and type(example[0]) == str:
            h5 = self.h5
            filters = tables.Filters(complevel=self.compression_level,
                                     complib='zlib',
                                     shuffle=True)

            nodes = h5.list_nodes(h5.root)

            nmpt = name.replace('.', '/\n')
            nmpt = nmpt.split('\n')

            path = '/'
            for kay in range(len(nmpt) - 1):
                #if not path+nmpt[kay][:-1] in str(nodes): h5.create_group(path,nmpt[kay][:-1])
                try:
                    h5.is_visible_node(path + nmpt[kay][:-1])
                except:
                    h5.create_group(path, nmpt[kay][:-1])
                path += nmpt[kay]

            self.tables[name] = h5.create_vlarray(path,
                                                  nmpt[-1],
                                                  h5type,
                                                  filters=filters)

        self.types[name] = type(example)
Example 30
    def _hdf5(self, alphabet_path, hdf5_path, ninput=26, ncontext=9):
        skipped = []
        str_to_label = {}
        alphabet_size = 0
        with codecs.open(alphabet_path, 'r', 'utf-8') as fin:
            for line in fin:
                if line[0:2] == '\\#':
                    line = '#\n'
                elif line[0] == '#':
                    continue
                str_to_label[line[:-1]] = alphabet_size
                alphabet_size += 1

        def process_sample(sample):
            if len(sample.transcript) == 0:
                skipped.append(sample.original_name)
                return None
            sample.write()
            try:
                samplerate, audio = wav.read(sample.file.filename)
            except:
                skipped.append(sample.original_name)
                return None
            features = mfcc(audio, samplerate=samplerate, numcep=ninput)[::2]
            empty_context = np.zeros((ncontext, ninput), dtype=features.dtype)
            features = np.concatenate((empty_context, features, empty_context))
            transcript = np.asarray(
                [str_to_label[c] for c in sample.transcript])
            if (2 * ncontext + len(features)) < len(transcript):
                skipped.append(sample.original_name)
                return None
            return features, len(features), transcript, len(transcript)

        out_data = self._map('Computing MFCC features...', self.samples,
                             process_sample)
        out_data = [s for s in out_data if s is not None]
        if len(skipped) > 0:
            log('WARNING - Skipped %d samples that had no transcription, were too short for their transcription, or were missing:'
                % len(skipped))
            for s in skipped:
                log(' - Sample origin: "%s".' % s)
        if len(out_data) <= 0:
            log('No samples written to feature DB "%s".' % hdf5_path)
            return
        # list of tuples -> tuple of lists
        features, features_len, transcript, transcript_len = zip(*out_data)

        log('Writing feature DB...')
        with tables.open_file(hdf5_path, 'w') as file:
            features_dset = file.create_vlarray(
                file.root,
                'features',
                tables.Float32Atom(),
                filters=tables.Filters(complevel=1))
            # VLArray atoms need to be 1D, so flatten feature array
            for f in features:
                features_dset.append(np.reshape(f, -1))
            features_len_dset = file.create_array(file.root, 'features_len',
                                                  features_len)

            transcript_dset = file.create_vlarray(
                file.root,
                'transcript',
                tables.Int32Atom(),
                filters=tables.Filters(complevel=1))
            for t in transcript:
                transcript_dset.append(t)

            transcript_len_dset = file.create_array(file.root,
                                                    'transcript_len',
                                                    transcript_len)
        log('Wrote features of %d samples to feature DB "%s".' %
            (len(features), hdf5_path))
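For completeness, a sketch of reading this feature DB back, mirroring the cache-loading branch of Example 26 (hdf5_path and ninput as above; note that the features_len stored by this method already includes the 2 * ncontext padding frames):

import tables

with tables.open_file(hdf5_path, 'r') as f:
    features = f.root.features[:]          # list of flattened 1-D rows
    features_len = f.root.features_len[:]
features = [x.reshape(l, ninput) for x, l in zip(features, features_len)]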