Example #1
  def _train_epoch(self, h5f):
    offset, size = 0, h5f["cols"].shape[0]
    pbar = aux.Progbar(size, stateful_metrics=["loss"])
    loss_nume, loss_deno = 0, 0
    while True:
      # advance by ~batch_size nonzeros, then snap to the next document boundary
      target = h5f["indptr"][offset] + self.opt.batch_size
      if target < size:
        next_offset = h5f["rows"][target]
      else:
        next_offset = h5f["indptr"].shape[0] - 1
      indptr = h5f["indptr"][offset:next_offset + 1]
      beg, end = indptr[0], indptr[-1]
      indptr -= beg
      cols = h5f["cols"][beg:end]
      offset = next_offset

      # call cuda kernel
      _loss_nume, _loss_deno = \
        self.obj.feed_data(cols, indptr.astype(np.int32))

      # accumulate loss
      loss_nume += _loss_nume
      loss_deno += _loss_deno
      loss = loss_nume / (loss_deno + EPS)

      # update progress bar
      pbar.update(end, values=[("loss", loss)])
      if end == size:
        break
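The batching above advances by roughly batch_size nonzeros, then snaps to the next document boundary through the rows dataset. A minimal, self-contained sketch of the same scheme on small synthetic arrays (the arrays and sizes are illustrative, not taken from the library):

import numpy as np

indptr = np.array([0, 3, 5, 9, 10], dtype=np.int64)  # 4 documents, 10 nonzeros
rows = np.repeat(np.arange(4), np.diff(indptr))       # document id of each nonzero
batch_size, size = 4, int(indptr[-1])

offset = 0
while True:
    target = indptr[offset] + batch_size
    if target < size:
        next_offset = rows[target]        # document holding the target-th nonzero
    else:
        next_offset = len(indptr) - 1     # final batch: take all remaining documents
    local_indptr = indptr[offset:next_offset + 1].copy()
    beg, end = local_indptr[0], local_indptr[-1]
    print(f"docs {offset}..{next_offset - 1}, nonzeros {beg}..{end - 1}")
    offset = next_offset
    if end == size:
        break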
Example #2
  def convert_bow_to_h5(self, filepath, h5_path):
    self.logger.info("convert bow %s to h5 %s", filepath, h5_path)
    num_docs, num_words, num_lines = \
      self.obj.read_bag_of_words_header(filepath)
    self.logger.info("number of docs: %d, words: %d, nnz: %d",
                     num_docs, num_words, num_lines)
    h5f = h5py.File(h5_path, "w")
    rows = h5f.create_dataset("rows", dtype=np.int64,
                              shape=(num_lines,), chunks=True)
    cols = h5f.create_dataset("cols", dtype=np.int32,
                              shape=(num_lines,), chunks=True)
    counts = h5f.create_dataset("counts", dtype=np.float32,
                                shape=(num_lines,), chunks=True)
    vali = h5f.create_dataset("vali", dtype=np.float32,
                              shape=(num_lines,), chunks=True)
    indptr = h5f.create_dataset("indptr", dtype=np.int64,
                                shape=(num_docs + 1,), chunks=True)
    indptr[0] = 0
    processed, recent_row, indptr_offset = 0, 0, 1
    pbar = aux.Progbar(num_lines, unit_name="line")
    while processed < num_lines:
      # get chunk size
      read_lines = min(num_lines - processed, self.opt.chunk_lines)

      # copy rows, cols, counts to h5
      _rows = np.empty((read_lines,), dtype=np.int64)
      _cols = np.empty((read_lines,), dtype=np.int32)
      _counts = np.empty((read_lines,), dtype=np.float32)
      self.obj.read_bag_of_words_content(_rows, _cols, _counts)
      rows[processed:processed + read_lines] = _rows
      cols[processed:processed + read_lines] = _cols
      counts[processed:processed + read_lines] = _counts
      vali[processed:processed + read_lines] = \
        np.random.uniform(size=(read_lines,)).astype(np.float32)

      # compute indptr at document boundaries (diff > 1 covers empty documents)
      prev_rows = np.zeros((read_lines,), dtype=np.int64)
      prev_rows[1:] = _rows[:-1]
      prev_rows[0] = recent_row
      diff = _rows - prev_rows
      indices = np.where(diff > 0)[0]
      _indptr = []
      for idx in indices:
        _indptr += ([processed + idx] * diff[idx])
      if _indptr:
        indptr[indptr_offset:indptr_offset + len(_indptr)] = \
          np.array(_indptr, dtype=np.int64)
        indptr_offset += len(_indptr)

      # update processed
      processed += read_lines
      pbar.update(processed)
      recent_row = _rows[-1]

    # finalize indptr
    _indptr = [num_lines] * (num_docs + 1 - indptr_offset)
    indptr[indptr_offset:num_docs + 1] = np.array(_indptr, dtype=np.int64)

    h5f.close()
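The converter writes a CSR-like layout: indptr over documents, with cols and counts holding the nonzeros in document order. A hedged sketch of reading the file back and checking it against scipy's csr_matrix (the path and the scipy round-trip are illustrative, not part of the converter):

import h5py
from scipy.sparse import csr_matrix

with h5py.File("bow.h5", "r") as h5f:  # hypothetical output path
    indptr = h5f["indptr"][:]
    cols = h5f["cols"][:]
    counts = h5f["counts"][:]
num_docs, num_words = len(indptr) - 1, int(cols.max()) + 1
mat = csr_matrix((counts, cols, indptr), shape=(num_docs, num_words))
print(mat.shape, mat.nnz)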
Example #3
 def convert_stream_to_h5(self, filepath, min_count, out_dir,
                          chunk_indices=10000, seed=777):
   np.random.seed(seed)
   os.makedirs(out_dir, exist_ok=True)
   keys_path = pjoin(out_dir, "keys.txt")
   count_path = pjoin(out_dir, "count.txt")
   token_path = pjoin(out_dir, "token.h5")
   self.logger.info("save key, count, token to %s, %s, %s",
                    keys_path, count_path, token_path)
   self.load_stream_vocab(filepath, min_count, keys_path, count_path)
   full_num_lines = self.obj.load_stream_file(filepath)
   pbar = aux.Progbar(full_num_lines, unit_name="line")
   processed = 0
   h5f = h5py.File(token_path, "w")
   rows = h5f.create_dataset("rows", shape=(chunk_indices,),
                             maxshape=(None,), dtype=np.int64,
                             chunks=(chunk_indices,))
   cols = h5f.create_dataset("cols", shape=(chunk_indices,),
                             maxshape=(None,), dtype=np.int32,
                             chunks=(chunk_indices,))
   vali = h5f.create_dataset("vali", shape=(chunk_indices,),
                             maxshape=(None,), dtype=np.float32,
                             chunks=(chunk_indices,))
   indptr = h5f.create_dataset("indptr", shape=(full_num_lines + 1,),
                               dtype=np.int64, chunks=True)
   processed, offset = 1, 0
   indptr[0] = 0
   while True:
     # tokenize the next chunk of lines into token (row, col) pairs
     read_lines, data_size = self.obj.tokenize_stream(
       self.opt.chunk_lines, self.opt.num_threads)
     _rows = np.empty(shape=(data_size,), dtype=np.int32)
     _cols = np.empty(shape=(data_size,), dtype=np.int32)
     _indptr = np.empty(shape=(read_lines,), dtype=np.int32)
     self.obj.get_token(_rows, _cols, _indptr)
     # grow the resizable datasets and append this chunk
     rows.resize((offset + data_size,))
     rows[offset:offset + data_size] = \
       _rows.astype(np.int64) + (processed - 1)
     cols.resize((offset + data_size,))
     cols[offset:offset + data_size] = _cols
     vali.resize((offset + data_size,))
     vali[offset:offset + data_size] = \
       np.random.uniform(size=(data_size,)).astype(np.float32)
     # chunk-local indptr values are shifted to global nonzero offsets
     indptr[processed:processed + read_lines] = \
       _indptr.astype(np.int64) + offset
     offset += data_size
     processed += read_lines
     pbar.update(processed - 1)
     if processed == full_num_lines + 1:
       break
   h5f.close()
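token.h5 reuses the indptr layout, but over input lines rather than documents, and grows its datasets chunk by chunk. Assuming the datasets written above, a short sketch of recovering per-line token ids (the path is a placeholder):

import h5py

with h5py.File("token.h5", "r") as h5f:
    indptr = h5f["indptr"][:]
    cols = h5f["cols"][:]
for line_idx in range(min(3, len(indptr) - 1)):
    beg, end = indptr[line_idx], indptr[line_idx + 1]
    print(line_idx, cols[beg:end])  # token ids of this line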
Example #4
 def load_stream_vocab(self, filepath, min_count,
                       keys_path, count_path):
   full_num_lines = self.obj.load_stream_file(filepath)
   pbar = aux.Progbar(full_num_lines, unit_name="line",
                       stateful_metrics=["word_count"])
   processed = 0
   while True:
     read_lines, word_count = \
       self.obj.read_stream_for_vocab(
         self.opt.chunk_lines, self.opt.num_threads)
     processed += read_lines
     pbar.update(processed, values=[("word_count", word_count)])
     if processed == full_num_lines:
       break
   self.obj.get_word_vocab(min_count, keys_path, count_path)
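get_word_vocab writes the surviving vocabulary to keys_path and the matching frequencies to count_path. Assuming a plain one-entry-per-line format for both files (an assumption, not verified here), a minimal sketch of loading them back into a lookup table:

with open("keys.txt") as fkeys, open("count.txt") as fcounts:  # paths as written above
    keys = [line.strip() for line in fkeys]
    counts = [int(line.strip()) for line in fcounts]
word2idx = {word: idx for idx, word in enumerate(keys)}  # token id = line number
print(len(word2idx), counts[:5])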
Example #5
    def _train_e_step(self, h5f, gamma_h5f, epoch):
        offset, size = 0, h5f["cols"].shape[0]
        pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"])
        train_loss_nume, train_loss_deno = 0, 0
        vali_loss_nume, vali_loss_deno = 0, 0
        while True:
            target = h5f["indptr"][offset] + self.opt.batch_size
            if target < size:
                next_offset = h5f["rows"][target]
            else:
                next_offset = h5f["indptr"].shape[0] - 1
            indptr = h5f["indptr"][offset:next_offset + 1]
            beg, end = indptr[0], indptr[-1]
            indptr -= beg
            cols = h5f["cols"][beg:end]
            counts = h5f["counts"][beg:end]
            # nonzeros whose uniform draw falls below vali_p are held out
            vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(bool)
            gamma = gamma_h5f[offset:next_offset, :]

            # call cuda kernel
            train_loss, vali_loss = \
              self.obj.feed_data(cols, indptr.astype(np.int32),
                                 vali, counts, gamma,
                                 epoch == 1 or self.opt.reuse_gamma,
                                 self.opt.num_iters_in_e_step)

            gamma_h5f[offset:next_offset, :] = gamma
            # accumulate loss
            train_loss_nume -= train_loss
            vali_loss_nume -= vali_loss
            train_loss_deno += np.sum(counts[~vali])
            vali_loss_deno += np.sum(counts[vali])
            train_loss = train_loss_nume / (train_loss_deno + EPS)
            vali_loss = vali_loss_nume / (vali_loss_deno + EPS)

            # update progress bar
            pbar.update(end,
                        values=[("train_loss", train_loss),
                                ("vali_loss", vali_loss)])
            offset = next_offset

            if end == size:
                break
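The E-step's validation split is driven by the precomputed vali dataset: nonzeros whose uniform draw falls below vali_p are held out, and each loss is normalized by the total count mass on its own side. A small self-contained sketch of that bookkeeping with synthetic values (EPS and the raw loss sums are placeholders, not outputs of the real kernel):

import numpy as np

EPS = 1e-10  # assumed small constant
counts = np.array([2., 1., 3., 1., 4.], dtype=np.float32)
draws = np.array([0.05, 0.7, 0.2, 0.9, 0.4], dtype=np.float32)  # the "vali" dataset
vali_p = 0.3
vali = draws < vali_p                 # held-out nonzeros
train_mass = np.sum(counts[~vali])    # denominator for train_loss
vali_mass = np.sum(counts[vali])      # denominator for vali_loss
train_loss_nume, vali_loss_nume = 12.5, 6.0  # hypothetical accumulated loss sums
print(train_loss_nume / (train_mass + EPS), vali_loss_nume / (vali_mass + EPS))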