Example #1
 def optimize(self):
     opt = self.opt.optimize
     iters, max_trials = 0, opt.get('max_trials', -1)
     space = self._get_space(opt.space)
     with log.pbar(log.INFO,
                   desc='optimizing... ',
                   total=None if max_trials == -1 else max_trials,
                   mininterval=30) as pbar:
         tb_opt = None
         tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
         if opt.start_with_default_parameters:
             with log.supress_log_level(log.WARN):
                 loss = self._optimize({})
             self.logger.info(
                 f'Starting with default parameter result: {loss}')
             self._optimization_info['best'] = loss
         # NOTE: need better way
         tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
         self.initialize_tensorboard(
             1000000 if max_trials == -1 else max_trials,
             name_postfix='.optimize')
         tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
         while max_trials:
             with log.supress_log_level(log.WARN):
                 best = fmin(fn=self._optimize,
                             space=space,
                             algo=tpe.suggest,
                             max_evals=len(
                                 self._optimization_info['trials'].trials) + 1,
                             trials=self._optimization_info['trials'],
                             show_progressbar=False)
             tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
             self.update_tensorboard_data(self._optimize_loss)
             tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
             iters += 1
             max_trials -= 1
             if self._optimization_info.get('best', {}).get(
                     'loss', 987654321) > self._optimize_loss['loss']:
                 is_first_time = self._optimization_info['best'] == {}
                 best = self._optimize_loss  # we cannot use return value of hyperopt due to randint behavior patch
                 self.logger.info(
                     f'Found new best parameters: {best} @ iter {iters}')
                 self._optimization_info['best'] = best
                 if opt.deployment and (is_first_time or not opt.min_trials
                                        or opt.min_trials >= iters):
                     if not self.opt.model_path:
                         raise RuntimeError(
                             'Failed to dump model: model path is not defined'
                         )
                     self.logger.info('Saving model... to {}'.format(
                         self.opt.model_path))
                     self.save(self.opt.model_path)
             if self.optimize_after_callback_fn:
                 self.optimize_after_callback_fn(self)
             pbar.update(1)
             self.logger.debug('Params({}) Losses({})'.format(
                 self._optimize_params, self._optimize_loss))
         tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
         self.finalize_tensorboard()
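
The loop in Example #1 drives hyperopt one evaluation at a time: fmin is re-entered with max_evals set to one more than the number of trials already recorded, so each call runs exactly one new evaluation and control returns to the caller between evaluations (for TensorBoard logging, checkpointing, early stopping). A minimal sketch of that re-entrant pattern with plain hyperopt, outside the class above; the toy objective and search space are assumptions for illustration only:

# Sketch: one-evaluation-at-a-time use of hyperopt's fmin (toy objective).
from hyperopt import Trials, fmin, hp, tpe

space = {'lr': hp.loguniform('lr', -7, 0)}   # assumed toy search space
trials = Trials()

def objective(params):
    # stand-in loss; a real objective would train and evaluate a model here
    return (params['lr'] - 0.01) ** 2

for it in range(10):
    fmin(fn=objective,
         space=space,
         algo=tpe.suggest,
         max_evals=len(trials.trials) + 1,   # exactly one new evaluation
         trials=trials,
         show_progressbar=False)
    print(f'iter {it}: best loss so far {min(trials.losses()):.6f}')
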
Example #2
    def _iterate(self, buf, group='rowwise'):
        header = self.data.get_header()
        # end = header['num_users'] if group == 'rowwise' else header['num_items']
        int_group = 0 if group == 'rowwise' else 1
        st = time.time()
        self.obj.precompute(int_group)
        el, st = time.time() - st, time.time()
        loss_nume, loss_deno = 0.0, 0.0
        update_t, feed_t, updated = el, 0, 0
        buf.set_group(group)
        with log.pbar(log.DEBUG,
                      desc='%s' % group,
                      total=header['num_nnz'],
                      mininterval=30) as pbar:
            for sz in buf.fetch_batch():
                updated += sz
                start_x, next_x, indptr, keys, vals = buf.get()
                _feed_t, st = time.time() - st, time.time()

                _loss_nume, _loss_deno = self.obj.partial_update(
                    start_x, next_x, indptr, keys, vals, int_group)
                loss_nume += _loss_nume
                loss_deno += _loss_deno

                _update_t, st = time.time() - st, time.time()
                pbar.update(sz)
                feed_t += _feed_t
                update_t += _update_t
        self.logger.debug(
            f'{group} updated: processed({updated}) elapsed(data feed: {feed_t:0.3f}s update: {update_t:0.3f}s)'
        )
        return loss_nume, loss_deno
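
Example #2 (and, in the same way, Examples #4 and #5 below) separates two costs: the time spent fetching a batch from the buffer ("feed") and the time spent applying it ("update"), advancing the progress bar by the number of processed records. A minimal sketch of that timing pattern using tqdm instead of the log.pbar helper shown in the examples; the batch generator and apply_update callback are illustrative assumptions:

# Sketch: split feed/update timing around a progress bar (tqdm stand-in).
import time
from tqdm import tqdm

def iterate(fetch_batches, apply_update, total):
    feed_t = update_t = updated = 0
    with tqdm(total=total, mininterval=30) as pbar:
        st = time.time()
        for batch in fetch_batches:       # time to produce the batch -> feed
            feed_t += time.time() - st
            st = time.time()
            apply_update(batch)           # time to consume the batch -> update
            update_t += time.time() - st
            updated += len(batch)
            pbar.update(len(batch))
            st = time.time()
    return updated, feed_t, update_t

# toy usage: three batches of five fake records, a no-op update
stats = iterate(([0] * 5 for _ in range(3)), lambda batch: None, total=15)
print('processed(%d) feed(%.3fs) update(%.3fs)' % stats)
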
Example #3
 def _build_compressed_triplets(self,
                                db,
                                job_files,
                                num_lines,
                                max_key,
                                is_colwise=0):
     self.logger.info('Total job files: %s' % len(job_files))
     with log.pbar(log.INFO, total=len(job_files), mininterval=10) as pbar:
         indptr_index = 0
         data_index = 0
         RECORD_SIZE = 12
         prev_key = 0
         for job in job_files:
             with open(job, 'rb') as fin:
                 total_size = fin.seek(0, 2)
                 if total_size == 0:
                     continue
                 total_records = int(total_size / RECORD_SIZE)
                 fin.seek(0, 0)
                 data = np.fromstring(fin.read(),
                                      dtype=np.dtype([('u', 'i'),
                                                      ('i', 'i'),
                                                      ('v', 'f')]),
                                      count=total_records)
                 U, I, V = data['u'], data['i'], data['v']
                 if is_colwise:
                     U, I = I, U
                 U -= 1
                 I -= 1
                 V = self.value_prepro(V)
                 self.logger.debug("minU: {}, maxU: {}".format(U[0], U[-1]))
                 assert data_index + total_records <= num_lines, 'Requests data size(%s) exceed capacity(%s)' % (
                     data_index + total_records, num_lines)
                 db['key'][data_index:data_index + total_records] = I
                 db['val'][data_index:data_index + total_records] = V
                 indptr = [data_index for j in range(U[0] - prev_key)]
                 indptr += [
                     data_index + i for i in range(1, total_records)
                     for j in range(U[i] - U[i - 1])
                 ]
                 db['indptr'][indptr_index:indptr_index +
                              len(indptr)] = indptr
                 assert indptr_index + len(indptr) <= max_key
                 data_index += total_records
                 indptr_index += len(indptr)
                 prev_key = U[-1]
             pbar.update(1)
         db["indptr"][indptr_index:] = data_index
     for path in job_files:
         os.remove(path)
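
Example #3 packs sorted (user, item, value) triplets into a CSR-like layout: key and val hold the column indices and values, and indptr records the offsets at which each row's entries can be sliced out later. A minimal sketch of the same idea on toy data, using the standard numpy/scipy convention (indptr of length num_rows + 1 marking row starts), which differs in detail from the on-disk layout above:

# Sketch: build CSR row offsets from sorted triplets (toy data).
import numpy as np

rows = np.array([0, 0, 1, 3, 3, 3])      # sorted 0-based row ids
cols = np.array([2, 5, 1, 0, 2, 4])
vals = np.array([1., 2., 3., 4., 5., 6.])

num_rows = 4
indptr = np.zeros(num_rows + 1, dtype=np.int64)
np.add.at(indptr, rows + 1, 1)           # count entries per row
indptr = np.cumsum(indptr)               # prefix sums -> row start offsets

# row r's entries are cols[indptr[r]:indptr[r + 1]] / vals[indptr[r]:indptr[r + 1]]
assert list(cols[indptr[3]:indptr[4]]) == [0, 2, 4]
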
Example #4
    def _iterate(self):
        header = self.data.get_header()
        end = header['num_users']
        update_t, feed_t, updated = 0, 0, 0
        self.buf.set_group('rowwise')
        with log.pbar(log.DEBUG, total=header['num_nnz'],
                      mininterval=15) as pbar:
            start_t = time.time()
            for sz in self.buf.fetch_batch():
                updated += sz
                feed_t += time.time() - start_t
                start_x, next_x, indptr, keys = self.buf.get()

                start_t = time.time()
                self.obj.add_jobs(start_x, next_x, indptr, keys)
                update_t += time.time() - start_t
                pbar.update(sz)
        self.logger.debug(
            f'processed({updated}) elapsed(data feed: {feed_t:0.3f}s update: {update_t:0.3f}s)'
        )
Example #5
    def _iterate(self, buf, group='user'):
        assert group in ["user", "item",
                         "context"], f"group {group} is not properly provided"
        header = self.data.get_scale_info(with_sppmi=True)
        err, update_t, feed_t, updated = 0, 0, 0, 0
        if group == "user":
            self.obj.precompute("item".encode("utf8"))
            total = header["num_nnz"]
            _groups = ["rowwise"]
        elif group == "item":
            self.obj.precompute("user".encode("utf8"))
            total = header["num_nnz"] + header["sppmi_nnz"]
            _groups = ["colwise", "sppmi"]
        elif group == "context":
            total = header["sppmi_nnz"]
            _groups = ["sppmi"]

        with log.pbar(log.DEBUG,
                      desc='%s' % group,
                      total=total,
                      mininterval=30) as pbar:
            st = time.time()
            for start_x, next_x in buf.fetch_batch_range(_groups):
                feed_t += time.time() - st
                _err, _updated, _update_t, _feed_t = \
                    self.partial_update(buf, group, start_x, next_x)
                update_t += _update_t
                updated += _updated
                err += _err
                pbar.update(_updated)
                st = time.time()
            pbar.refresh()
        self.logger.debug(
            f'updated {group} processed({updated}) elapsed(data feed: {feed_t:.3f}s update: {update_t:.3f}s)'
        )
        return err
Example #6
    def _create(self, data_path, P, H):
        def get_max_column_length(fname):
            with open(fname) as fin:
                max_col = 0
                for l in fin:
                    max_col = max(max_col, len(l))
            return max_col

        uid_path, iid_path = P['uid_path'], P['iid_path']
        main_path = P['main_path']
        num_users, num_items, num_nnz = map(int, H.split())
        # Manually updating progress bar is a bit naive
        with log.pbar(log.DEBUG, total=5, mininterval=30) as pbar:
            uid_max_col = len(str(num_users)) + 1
            if uid_path:
                uid_max_col = get_max_column_length(uid_path) + 1
            pbar.update(1)
            iid_max_col = len(str(num_items)) + 1
            if iid_path:
                iid_max_col = get_max_column_length(iid_path) + 1
            pbar.update(1)
            try:
                db = self._create_database(data_path,
                                           num_users=num_users,
                                           num_items=num_items,
                                           num_nnz=num_nnz,
                                           uid_max_col=uid_max_col,
                                           iid_max_col=iid_max_col)
                idmap = db['idmap']
                # if not given, assume id as is
                if uid_path:
                    with open(uid_path) as fin:
                        idmap['rows'][:] = np.loadtxt(fin,
                                                      dtype=f'S{uid_max_col}')
                else:
                    idmap['rows'][:] = np.array(
                        [str(i) for i in range(1, num_users + 1)],
                        dtype=f'S{uid_max_col}')
                pbar.update(1)
                if iid_path:
                    with open(iid_path) as fin:
                        idmap['cols'][:] = np.loadtxt(fin,
                                                      dtype=f'S{iid_max_col}')
                else:
                    idmap['cols'][:] = np.array(
                        [str(i) for i in range(1, num_items + 1)],
                        dtype=f'S{iid_max_col}')
                pbar.update(1)
                num_header_lines = 0
                with open(main_path) as fin:
                    for line in fin:
                        if line.strip().startswith('%'):
                            num_header_lines += 1
                        else:
                            break
                pbar.update(1)
            except Exception as e:
                self.logger.error('Cannot create db: %s' % (str(e)))
                self.logger.error(traceback.format_exc())
                raise
        return db, num_header_lines
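
Example #6 stores the row/column id mappings as fixed-width byte strings: the width comes from the longest line of the id file when one is given, and otherwise from the number of digits of the largest generated id. A minimal standalone sketch of that step; the in-memory id file is an assumption for illustration:

# Sketch: fixed-width byte-string id arrays, generated vs. loaded from a file.
import io
import numpy as np

num_users = 5

# no id file: ids are generated as "1".."num_users"
gen_width = len(str(num_users)) + 1
rows = np.array([str(i) for i in range(1, num_users + 1)],
                dtype=f'S{gen_width}')

# id file given: width is taken from the longest line of the file
id_file = io.StringIO("alice\nbob\ncarol\ndave\neve\n")
file_width = max(len(line) for line in id_file.getvalue().splitlines()) + 1
id_file.seek(0)
rows_from_file = np.loadtxt(id_file, dtype=f'S{file_width}')

print(rows)            # [b'1' b'2' b'3' b'4' b'5']
print(rows_from_file)  # [b'alice' b'bob' b'carol' b'dave' b'eve']
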
Example #7
 def _create_working_data(self, db, source_path, ignore_lines):
     """
     Args:
         source_path: source data file path
         ignore_lines: number of lines to skip from start line
     """
     vali_indexes = [] if 'vali' not in db else db['vali']['indexes']
     vali_lines = []
     file_path = aux.get_temporary_file(self.opt.data.tmp_dir)
     with open(file_path, 'w') as w:
         fin = open(source_path, mode='r')
         file_size = fin.seek(0, 2)
         fin.seek(0, 0)
         for _ in range(ignore_lines):
             fin.readline()
         total = file_size - fin.tell()
         buffered = ''
         CHUNK_SIZE = 4096 * 1000
         total_lines = 0
         vali_indexes = sorted(vali_indexes)
         target_index = vali_indexes[0] if vali_indexes else -1
         vali_indexes = vali_indexes[1:]
         with log.pbar(log.INFO, total=total, mininterval=10) as pbar:
             while True:
                 buffered += fin.read(CHUNK_SIZE)
                 if buffered == '':
                     break
                 current_file_position = fin.tell()
                 pbar.update(CHUNK_SIZE)
                 num_lines_on_buffer = buffered.count('\n')
                 # search the position of validation sample and extract
                 # it from training data
                 while target_index >= 0 and target_index <= (
                         total_lines + num_lines_on_buffer):
                     no_line = total_lines
                     new_buffered = ''
                     from_index = 0
                     for idx, c in enumerate(buffered):
                         if c == '\n':
                             if no_line == target_index:
                                 vali_lines.append(buffered[from_index:idx])
                                 if from_index > 0:
                                     w.write(buffered[0:from_index])
                                 new_buffered = buffered[idx + 1:]
                                 no_line += 1
                                 total_lines += 1
                                 num_lines_on_buffer -= 1
                                 break
                             no_line += 1
                             total_lines += 1
                             from_index = idx + 1
                             num_lines_on_buffer -= 1
                     buffered = new_buffered
                     if vali_indexes:
                        target_index = vali_indexes[0]
                        vali_indexes = vali_indexes[1:]
                     else:
                         target_index = -1
                 where = buffered.rfind('\n')
                 total_lines += num_lines_on_buffer
                 if where != -1:
                     w.write(buffered[:where + 1])
                     buffered = buffered[where + 1:]
                 elif current_file_position == file_size:
                     w.write(buffered)
                     buffered = ''
         w.close()
         fin.close()
         return w.name, vali_lines
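
The chunked buffering in Example #7 keeps memory bounded on large inputs, but functionally it just splits the source by global line index: lines listed in vali_indexes are collected for validation, everything else goes to a new training file. A minimal line-by-line sketch of the same split; the function name and arguments are illustrative, not part of the original API:

# Sketch: split a text file into a training file plus held-out validation lines.
def split_train_vali(source_path, train_path, vali_indexes, ignore_lines=0):
    vali_indexes = set(vali_indexes)
    vali_lines = []
    with open(source_path) as fin, open(train_path, 'w') as fout:
        for _ in range(ignore_lines):    # skip header lines, as in the example
            fin.readline()
        for idx, line in enumerate(fin):
            if idx in vali_indexes:
                vali_lines.append(line.rstrip('\n'))
            else:
                fout.write(line)
    return train_path, vali_lines

# usage (paths are hypothetical): line indexes 1 and 3 become validation rows
# train_file, vali_lines = split_train_vali('ratings.txt', 'train.txt', [1, 3])
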