Esempio n. 1
0
 def read_csv(self,
              filename,
              dtype=float,
              delimiter=",",
              has_header=False,
              num_workers=4):
     """Read a CSV file as a list of single-block ``BlockArray`` objects.

     The file's byte size is split into batches via
     ``storage_utils.Batch.from_num_batches(file_size, num_workers)``; one
     remote ``"read_csv_block"`` call is issued per batch, each returning a
     block object id and a shape object id. Batches whose reported shape
     has a zero first dimension are dropped.

     Args:
         filename: Path of the CSV file, forwarded to ``read_csv_block``.
         dtype: Element type; forwarded to ``read_csv_block`` and its
             ``__name__`` is used as the grid dtype.
         delimiter: Field delimiter, forwarded to ``read_csv_block``.
         has_header: Header flag, forwarded to ``read_csv_block``.
         num_workers: Number of byte-range batches requested; also used as
             the one-dimensional ``grid_shape`` of the remote calls.

     Returns:
         A list with one ``BlockArray`` per non-empty batch, in batch order.
         Each array's grid uses the batch shape as both shape and block
         shape.
     """
     total_bytes = storage_utils.get_file_size(filename)
     byte_ranges: storage_utils.Batch = storage_utils.Batch.from_num_batches(
         total_bytes, num_workers)

     # Launch every block read first; shapes are resolved in bulk afterwards.
     data_oids = []
     dims_oids = []
     for idx, (range_start, range_end) in enumerate(byte_ranges.batches):
         call_kwargs = {
             "grid_entry": (idx, ),
             "grid_shape": (num_workers, ),
             "options": {
                 "num_returns": 2
             },
         }
         data_oid, dims_oid = self.cm.call(
             "read_csv_block",
             filename,
             range_start,
             range_end,
             dtype,
             delimiter,
             has_header,
             syskwargs=call_kwargs,
         )
         data_oids.append(data_oid)
         dims_oids.append(dims_oid)

     resolved_dims = self.cm.get(dims_oids)
     result = []
     for pos, dims in enumerate(resolved_dims):
         if dims[0] == 0:
             # This byte range produced no rows; nothing to wrap.
             continue
         data_oid = data_oids[pos]
         single_block_grid = ArrayGrid(shape=dims,
                                       block_shape=dims,
                                       dtype=dtype.__name__)
         wrapped = BlockArray(single_block_grid, self.cm)
         for entry_index, entry in enumerate(single_block_grid.get_entry_iterator()):
             # shape == block_shape, so exactly one grid entry is expected.
             assert entry_index == 0
             wrapped.blocks[entry].oid = data_oid
         result.append(wrapped)
     return result
Esempio n. 2
0
 def loadtxt(self, fname, dtype=float, comments='# ', delimiter=' ',
             converters=None, skiprows=0, usecols=None, unpack=False,
             ndmin=0, encoding='bytes', max_rows=None, num_workers=4) -> BlockArray:
     """Load a text file into a row-partitioned ``BlockArray``.

     The number of data rows is derived from the file size: the characters
     taken up by comment lines and trailing newlines are subtracted, and the
     remainder is divided by the per-row character count reported by
     ``storage_utils.get_np_txt_info``. The asserts require these divisions
     to be exact, i.e. every data row must have the same width.

     Rows are split into ``num_workers`` batches and each batch is loaded
     through ``self.loadtxt_block`` with its own ``skiprows``/``max_rows``.

     Args:
         fname: Path of the text file.
         dtype: Element type; ``float`` is recorded as ``float64`` on the grid.
         comments: Comment marker, used to locate non-data lines.
         delimiter: Column delimiter.
         converters, usecols, unpack, ndmin, encoding: Forwarded unchanged to
             ``loadtxt_block``.
         skiprows: Number of leading rows excluded from the result.
         max_rows: Optional upper bound on the number of rows loaded.
         num_workers: Number of row batches (blocks) to create.

     Returns:
         A ``BlockArray`` of shape ``(num_rows_final, num_cols)`` whose blocks
         hold the object ids returned by ``loadtxt_block``.
     """
     # pylint: disable=unused-variable
     bytes_per_char, bytes_per_row, bytes_per_col, num_cols = storage_utils.get_np_txt_info(
         fname, comments, delimiter
     )
     chars_per_row = bytes_per_row // bytes_per_char
     assert np.allclose(float(chars_per_row), bytes_per_row / bytes_per_char)

     # Characters that do not belong to data rows: comment lines and
     # trailing newlines.
     comment_lines, trailing_newlines = storage_utils.get_np_comments(fname, comments)
     nonrow_chars = trailing_newlines + sum(len(line) for line in comment_lines)

     file_size = storage_utils.get_file_size(fname)
     file_chars = file_size // bytes_per_char
     assert np.allclose(float(file_chars), file_size / bytes_per_char)

     row_chars = file_chars - nonrow_chars
     num_rows = row_chars // chars_per_row
     assert np.allclose(float(num_rows), float(row_chars / chars_per_row))

     num_rows_final = num_rows - skiprows
     if max_rows is not None:
         # Cap the row count. This must stay an integer: it is used both to
         # build the row batches and as the first dimension of the grid shape.
         num_rows_final = min(num_rows_final, max_rows)

     row_batches: storage_utils.Batch = storage_utils.Batch.from_num_batches(num_rows_final,
                                                                             num_workers)
     grid = ArrayGrid(shape=(num_rows_final, num_cols),
                      block_shape=(row_batches.batch_size, num_cols),
                      dtype=np.float64.__name__ if dtype is float else dtype.__name__)
     result: BlockArray = BlockArray(grid, system=self.system)
     for i, grid_entry in enumerate(grid.get_entry_iterator()):
         row_start, row_end = row_batches.batches[i]
         # NOTE(review): the extra +1 presumably skips a leading header/comment
         # line in the file -- confirm against loadtxt_block.
         batch_skiprows = skiprows + row_start + 1
         batch_max_rows = grid.get_block_shape(grid_entry)[0]
         # The grid's block height must agree with the row batch it maps to.
         assert batch_max_rows == row_end - row_start
         result.blocks[grid_entry].oid = self.loadtxt_block(
             fname, dtype=dtype, comments=comments, delimiter=delimiter,
             converters=converters, skiprows=batch_skiprows,
             usecols=usecols, unpack=unpack, ndmin=ndmin,
             encoding=encoding, max_rows=batch_max_rows,
             syskwargs={
                 "grid_entry": grid_entry,
                 "grid_shape": grid.grid_shape
             }
         )
     return result