Example #1
    def _iterate(self, buf, group='rowwise'):
        header = self.data.get_header()
        # end = header['num_users'] if group == 'rowwise' else header['num_items']
        int_group = 0 if group == 'rowwise' else 1
        st = time.time()
        self.obj.precompute(int_group)
        el, st = time.time() - st, time.time()
        loss_nume, loss_deno = 0.0, 0.0
        update_t, feed_t, updated = el, 0, 0
        buf.set_group(group)
        with log.ProgressBar(log.DEBUG,
                             desc='%s' % group,
                             total=header['num_nnz'],
                             mininterval=30) as pbar:
            for sz in buf.fetch_batch():
                updated += sz
                start_x, next_x, indptr, keys, vals = buf.get()
                _feed_t, st = time.time() - st, time.time()

                _loss_nume, _loss_deno = self.obj.partial_update(
                    start_x, next_x, indptr, keys, vals, int_group)
                loss_nume += _loss_nume
                loss_deno += _loss_deno

                _update_t, st = time.time() - st, time.time()
                pbar.update(sz)
                feed_t += _feed_t
                update_t += _update_t
        self.logger.debug(
            f'{group} updated: processed({updated}) elapsed(data feed: {feed_t:0.3f}s update: {update_t:0.3f}s)'
        )
        return loss_nume, loss_deno
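
This method drives the progress bar manually: total is the number of non-zero entries and pbar.update(sz) advances it by each fetched batch size, while the data-feed and update phases are timed separately. Below is a minimal standalone sketch of that pattern, with plain tqdm standing in for the log.ProgressBar wrapper; batch_source and process are hypothetical stand-ins.

import time
from tqdm import tqdm

def iterate_batches(batch_source, process, total_nnz):
    # accumulate time spent fetching batches vs. applying updates, as above
    feed_t, update_t, updated = 0.0, 0.0, 0
    with tqdm(total=total_nnz, mininterval=30) as pbar:
        st = time.time()
        for batch in batch_source():
            feed_t += time.time() - st       # time spent producing the batch
            st = time.time()
            process(batch)                   # analogous to obj.partial_update(...)
            update_t += time.time() - st
            pbar.update(len(batch))          # advance by the records consumed
            updated += len(batch)
            st = time.time()
    return updated, feed_t, update_t
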
Example #2
    def _create(self, data_path, P, H):
        def get_max_column_length(fname):
            with open(fname) as fin:
                max_col = 0
                for l in fin:
                    max_col = max(max_col, len(l))
            return max_col

        uid_path, iid_path, main_path = P['uid_path'], P['iid_path'], P['main_path']
        num_users, num_items, num_nnz = map(int, H.split())
        # Manually updating progress bar is a bit naive
        with log.ProgressBar(log.DEBUG, total=5, mininterval=30) as pbar:
            uid_max_col = len(str(num_users)) + 1
            if uid_path:
                uid_max_col = get_max_column_length(uid_path) + 1
            pbar.update(1)
            iid_max_col = len(str(num_items)) + 1
            if iid_path:
                iid_max_col = get_max_column_length(iid_path) + 1
            pbar.update(1)
            try:
                db = self._create_database(data_path,
                                           num_users=num_users,
                                           num_items=num_items,
                                           num_nnz=num_nnz,
                                           uid_max_col=uid_max_col,
                                           iid_max_col=iid_max_col)
                idmap = db['idmap']
                # if not given, assume id as is
                if uid_path:
                    with open(uid_path) as fin:
                        idmap['rows'][:] = np.loadtxt(fin, dtype=f'S{uid_max_col}')
                else:
                    idmap['rows'][:] = np.array([str(i) for i in range(1, num_users + 1)],
                                                dtype=f'S{uid_max_col}')
                pbar.update(1)
                if iid_path:
                    with open(iid_path) as fin:
                        idmap['cols'][:] = np.loadtxt(fin, dtype=f'S{iid_max_col}')
                else:
                    idmap['cols'][:] = np.array([str(i) for i in range(1, num_items + 1)],
                                                dtype=f'S{iid_max_col}')
                pbar.update(1)
                num_header_lines = 0
                with open(main_path) as fin:
                    for line in fin:
                        if line.strip().startswith('%'):
                            num_header_lines += 1
                        else:
                            break
                pbar.update(1)
            except Exception as e:
                self.logger.error('Cannot create db: %s' % (str(e)))
                self.logger.error(traceback.format_exc())
                raise
        return db, num_header_lines
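
Two details are worth noting here: the progress bar is advanced a fixed number of times (total=5, one update(1) per preparation stage), and user/item ids are stored as fixed-width byte strings whose width is the longest line in the id file plus one. A small sketch of that fixed-width id loading, assuming a hypothetical one-id-per-line file:

import numpy as np

def load_fixed_width_ids(path):
    # width is the longest line (newline included) plus one, so every id fits
    with open(path) as fin:
        max_col = max((len(line) for line in fin), default=0) + 1
    with open(path) as fin:
        return np.loadtxt(fin, dtype=f'S{max_col}')

# ids = load_fixed_width_ids('user_ids.txt')  # e.g. array([b'u1', b'u2', ...])
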
Example #3
 def _build_compressed_triplets(self,
                                db,
                                job_files,
                                num_lines,
                                max_key,
                                is_colwise=0):
     self.logger.info('Total job files: %s' % len(job_files))
     with log.ProgressBar(log.INFO, total=len(job_files),
                          mininterval=10) as pbar:
         indptr_index = 0
         data_index = 0
         RECORD_SIZE = 12
         prev_key = 0
         for job in job_files:
             with open(job, 'rb') as fin:
                 total_size = fin.seek(0, 2)
                 if total_size == 0:
                     continue
                 total_records = int(total_size / RECORD_SIZE)
                 fin.seek(0, 0)
                 data = np.frombuffer(fin.read(),
                                      dtype=np.dtype([('u', 'i'),
                                                      ('i', 'i'),
                                                      ('v', 'f')]),
                                      count=total_records)
                 U, I, V = data['u'], data['i'], data['v']
                 if is_colwise:
                     U, I = I, U
                 if self.opt.data.value_prepro:
                     V = self.value_prepro(V.copy())
                 self.logger.debug("minU: {}, maxU: {}".format(U[0], U[-1]))
                  assert data_index + total_records <= num_lines, 'Requested data size(%s) exceeds capacity(%s)' % (
                      data_index + total_records, num_lines)
                 db['key'][data_index:data_index + total_records] = I
                 db['val'][data_index:data_index + total_records] = V
                 diff = U[1:] - U[:-1]
                 max_diff = np.amax(diff) if len(diff) else 0
                 indptr = [data_index for _ in range(U[0] - prev_key)]
                 for i in range(max_diff):
                     indptr += (np.where(diff > i)[0] + data_index +
                                1).tolist()
                 indptr.sort()
                 db['indptr'][indptr_index:indptr_index +
                              len(indptr)] = indptr
                 assert indptr_index + len(indptr) <= max_key
                 data_index += total_records
                 indptr_index += len(indptr)
                 prev_key = U[-1]
             pbar.update(1)
         db["indptr"][indptr_index:] = data_index
     for path in job_files:
         os.remove(path)
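
The job files read above are streams of fixed-size 12-byte records (int32 user, int32 item, float32 value) decoded in one call with a structured dtype; the user column, assumed sorted, is then converted into CSR-style indptr offsets. A minimal sketch of the record-decoding step (the file path is hypothetical):

import numpy as np

# 4 + 4 + 4 = 12 bytes per record, matching RECORD_SIZE above
TRIPLET_DTYPE = np.dtype([('u', 'i4'), ('i', 'i4'), ('v', 'f4')])

def read_triplets(path):
    with open(path, 'rb') as fin:
        raw = fin.read()
    records = np.frombuffer(raw, dtype=TRIPLET_DTYPE)
    return records['u'], records['i'], records['v']
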
Example #4
    def _iterate(self):
        header = self.data.get_header()
        # end = header['num_users']
        update_t, feed_t, updated = 0, 0, 0
        self.buf.set_group('rowwise')
        with log.ProgressBar(log.DEBUG,
                             total=header['num_nnz'], mininterval=15) as pbar:
            start_t = time.time()
            for sz in self.buf.fetch_batch():
                updated += sz
                feed_t += time.time() - start_t
                start_x, next_x, indptr, keys = self.buf.get()

                start_t = time.time()
                self.obj.add_jobs(start_x, next_x, indptr, keys)
                update_t += time.time() - start_t
                pbar.update(sz)
        self.logger.debug(f'processed({updated}) elapsed(data feed: {feed_t:0.3f}s update: {update_t:0.3f}s)')
Example #5
 def _create_working_data(self, db, source_path, ignore_lines):
     """
     Args:
         source_path: source data file path
         ignore_lines: number of lines to skip from start line
     """
     vali_indexes = [] if 'vali' not in db else db['vali']['indexes']
     vali_lines = []
     file_path = aux.get_temporary_file(self.opt.data.tmp_dir)
     with open(file_path, 'w') as w:
         fin = open(source_path, mode='r')
         file_size = fin.seek(0, 2)
         fin.seek(0, 0)
         for _ in range(ignore_lines):
             fin.readline()
         total = file_size - fin.tell()
         buffered = ''
         CHUNK_SIZE = 4096 * 1000
         total_lines = 0
         vali_indexes = sorted(vali_indexes)
         target_index = vali_indexes[0] if vali_indexes else -1
         vali_indexes = vali_indexes[1:]
         with log.ProgressBar(log.INFO, total=total, mininterval=10) as pbar:
             while True:
                 buffered += fin.read(CHUNK_SIZE)
                 if buffered == '':
                     break
                 current_file_position = fin.tell()
                 pbar.update(CHUNK_SIZE)
                 num_lines_on_buffer = buffered.count('\n')
                 # search the position of validation sample and extract
                 # it from training data
                 while target_index >= 0 and target_index <= (total_lines + num_lines_on_buffer):
                     no_line = total_lines
                     new_buffered = ''
                     from_index = 0
                     for idx, c in enumerate(buffered):
                         if c == '\n':
                             if no_line == target_index:
                                 vali_lines.append(buffered[from_index:idx])
                                 if from_index > 0:
                                     w.write(buffered[0:from_index])
                                 new_buffered = buffered[idx + 1:]
                                 no_line += 1
                                 total_lines += 1
                                 num_lines_on_buffer -= 1
                                 break
                             no_line += 1
                             total_lines += 1
                             from_index = idx + 1
                             num_lines_on_buffer -= 1
                     buffered = new_buffered
                     if vali_indexes:
                         target_index, vali_indexes = vali_indexes[0], vali_indexes[1:]
                     else:
                         target_index = -1
                 where = buffered.rfind('\n')
                 total_lines += num_lines_on_buffer
                 if where != -1:
                     w.write(buffered[:where + 1])
                     buffered = buffered[where + 1:]
                 elif current_file_position == file_size:
                     w.write(buffered)
                     buffered = ''
         w.close()
         fin.close()
         return w.name, vali_lines
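
The loop above copies the source file in large chunks while peeling out the lines whose indexes were reserved for validation; per chunk, only complete lines are written and the trailing partial line is carried into the next read. A simplified sketch of just the chunked copy (paths are hypothetical, validation extraction omitted):

CHUNK_SIZE = 4096 * 1000

def copy_complete_lines(src_path, dst_path):
    buffered = ''
    with open(src_path) as fin, open(dst_path, 'w') as out:
        while True:
            chunk = fin.read(CHUNK_SIZE)
            if not chunk and not buffered:
                break
            buffered += chunk
            cut = buffered.rfind('\n')
            if cut != -1:
                out.write(buffered[:cut + 1])  # flush only complete lines
                buffered = buffered[cut + 1:]  # keep the partial tail
            if not chunk:                      # end of file: flush the remainder
                out.write(buffered)
                buffered = ''
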
Example #6
    def _create(self, data_path, P):
        def get_max_column_length(fname):
            with open(fname) as fin:
                max_col = 0
                for l in fin:
                    max_col = max(max_col, len(l))
            return max_col

        uid_path, iid_path, main_path = P['uid_path'], P['iid_path'], P[
            'main_path']
        if uid_path:
            with open(uid_path) as fin:
                num_users = len([1 for _ in fin])
        else:
            with open(main_path) as fin:
                num_users = len([1 for _ in fin])

        uid_max_col = len(str(num_users)) + 1
        if uid_path:
            uid_max_col = get_max_column_length(uid_path) + 1

        vali_n = self.opt.data.validation.get('n', 0)
        num_nnz, vali_limit, itemids = 0, 0, set()
        self.logger.info(f'gathering itemids from {main_path}...')
        if self.opt.data.validation.name not in ["newest"]:
            vali_n = 0
        with open(main_path) as fin:
            for line in log.ProgressBar(level=log.DEBUG, iterable=fin):
                data = line.strip().split()
                if not iid_path:
                    itemids |= set(data)

                data_size = len(data)
                _vali_size = min(vali_n, len(data) - 1)
                vali_limit += _vali_size
                if self.opt.data.internal_data_type == 'stream':
                    num_nnz += (data_size - _vali_size)
                elif self.opt.data.internal_data_type == 'matrix':
                    num_nnz += len(set(data[:(data_size - _vali_size)]))
        if iid_path:
            with open(iid_path) as fin:
                itemids = {iid.strip(): idx + 1 for idx, iid in enumerate(fin)}
        else:  # in case item information is not given
            itemids = {i: idx + 1 for idx, i in enumerate(itemids)}
        iid_max_col = max(len(k) + 1 for k in itemids.keys())
        num_items = len(itemids)

        self.logger.info('Found %d unique itemids' % len(itemids))

        try:
            db = self._create_database(data_path,
                                       num_users=num_users,
                                       num_items=num_items,
                                       num_nnz=num_nnz,
                                       uid_max_col=uid_max_col,
                                       iid_max_col=iid_max_col,
                                       num_validation_samples=vali_limit)
            idmap = db['idmap']
            # if not given, assume id as is
            if uid_path:
                with open(uid_path) as fin:
                    idmap['rows'][:] = np.loadtxt(fin, dtype=f'S{uid_max_col}')
            else:
                idmap['rows'][:] = np.array(
                    [str(i) for i in range(1, num_users + 1)],
                    dtype=f'S{uid_max_col}')
            if iid_path:
                with open(iid_path) as fin:
                    idmap['cols'][:] = np.loadtxt(fin, dtype=f'S{iid_max_col}')
            else:
                cols = sorted(itemids.items(), key=lambda x: x[1])
                cols = [k for k, _ in cols]
                idmap['cols'][:] = np.array(cols, dtype=f'S{iid_max_col}')
        except Exception as e:
            self.logger.error('Cannot create db: %s' % (str(e)))
            self.logger.error(traceback.format_exc())
            raise
        return db, itemids
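
Before creating the database, this method makes one counting pass over the main file: it collects unique item tokens (when no iid_path is given), counts non-zeros, and reserves the newest interactions per user for validation; note the iterable form log.ProgressBar(level=..., iterable=fin) used for that pass. A rough sketch of the counting pass, with tqdm standing in for the progress bar and a hypothetical file path:

from tqdm import tqdm

def gather_itemids(main_path):
    itemids, num_nnz = set(), 0
    with open(main_path) as fin:
        for line in tqdm(fin):
            tokens = line.strip().split()
            itemids |= set(tokens)
            num_nnz += len(tokens)
    # map each unique item token to a 1-based index (sorted here for determinism)
    return {iid: idx + 1 for idx, iid in enumerate(sorted(itemids))}, num_nnz
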
Example #7
    def _create_working_data(self,
                             db,
                             stream_main_path,
                             itemids,
                             with_sppmi=False,
                             windows=5):
        vali_method = None if 'vali' not in db else db['vali'].attrs['method']
        vali_indexes, vali_n = set(), 0
        if vali_method == 'sample':
            vali_indexes = set(db['vali']['indexes'])
        elif vali_method in ['newest']:
            vali_n = db['vali'].attrs['n']
        vali_lines = []
        # users = db['idmap']['rows'][:] will be used someday?
        sppmi_total_lines = 0

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ResourceWarning)
            if with_sppmi:
                w_sppmi = open(
                    aux.get_temporary_file(root=self.opt.data.tmp_dir), "w")
            file_path = aux.get_temporary_file(root=self.opt.data.tmp_dir)
            with open(stream_main_path) as fin, open(file_path, 'w') as w:
                total_index = 0
                internal_data_type = self.opt.data.internal_data_type
                for line_idx, data in log.ProgressBar(level=log.DEBUG,
                                                      iterable=enumerate(fin)):
                    data = data.strip().split()
                    # total_data_size = len(data) will be used someday?
                    user = line_idx + 1
                    vali_data, train_data = [], []
                    if vali_method in ['newest']:
                        vali_data_size = min(vali_n, len(data) - 1)
                        train_data_size = len(data) - vali_data_size
                        vali = data[train_data_size:]
                        data = data[:train_data_size]
                        for col, val in Counter(vali).items():
                            col = itemids[col]
                            vali_data.append(col)
                    if internal_data_type == 'stream':
                        for idx, col in enumerate(data):
                            col = itemids[col]
                            if (idx + total_index) in vali_indexes:
                                vali_data.append(col)
                            else:
                                train_data.append(col)
                    elif internal_data_type == 'matrix':
                        for idx, col in enumerate(data):
                            col = itemids[col]
                            if (idx + total_index) in vali_indexes:
                                vali_data.append(col)
                            else:
                                train_data.append(col)
                    total_index += len(data)
                    if internal_data_type == 'stream':
                        for col in train_data:
                            w.write(f'{user} {col} 1\n')
                        for col in vali_data:
                            vali_lines.append(f'{user} {col} 1')
                    else:
                        for col, val in Counter(train_data).items():
                            w.write(f'{user} {col} {val}\n')
                        for col, val in Counter(vali_data).items():
                            vali_lines.append(f'{user} {col} {val}')
                    if with_sppmi:
                        sz = len(train_data)
                        for i in range(sz):
                            beg, end = i + 1, i + windows + 1
                            for j in range(beg, end):
                                if j >= sz:
                                    break
                                _w, _c = train_data[i], train_data[j]
                                w_sppmi.write(f'{_w} {_c}\n')
                                w_sppmi.write(f'{_c} {_w}\n')
                                sppmi_total_lines += 2
                if with_sppmi:
                    w_sppmi.close()
                    return w.name, vali_lines, w_sppmi.name, sppmi_total_lines
                return w.name, vali_lines, None, None
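
When with_sppmi is set, the method also emits symmetric (item, context) pairs for every pair of items that appear within the window size of each other in a user's stream, which a later SPPMI computation can aggregate. A small self-contained sketch of that windowed pair generation:

def cooccurrence_pairs(sequence, window=5):
    # emit both (w, c) and (c, w) for items within `window` positions, as above
    pairs = []
    sz = len(sequence)
    for i in range(sz):
        for j in range(i + 1, min(i + window + 1, sz)):
            pairs.append((sequence[i], sequence[j]))
            pairs.append((sequence[j], sequence[i]))
    return pairs

# cooccurrence_pairs([10, 11, 12], window=2)
# -> [(10, 11), (11, 10), (10, 12), (12, 10), (11, 12), (12, 11)]
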
Example #8
    def optimize(self):
        assert self.opt.evaluation_on_learning, \
            'evaluation must be set to be true to do hyperparameter optimization.'
        if self.opt.optimize.loss.startswith("val"):
            assert self.opt.validation, \
                'validation option must be set to be true to do hyperparameter optimization with validation results.'

        opt = self.opt.optimize
        iters, max_trials = 0, opt.get('max_trials', -1)
        space = self._get_space(opt.space)
        with log.ProgressBar(log.INFO, desc='optimizing... ',
                             total=None if max_trials == -1 else max_trials,
                             mininterval=30) as pbar:
            tb_opt = None
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            if opt.start_with_default_parameters:
                with log.supress_log_level(log.WARN):
                    loss = self._optimize({})
                self.logger.info(f'Starting with default parameter result: {loss}')
                self._optimization_info['best'] = loss
                if opt.deployment:
                    self.logger.info('Saving model... to {}'.format(self.opt.model_path))
                    self.save(self.opt.model_path)
            # NOTE: need better way
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            self.initialize_tensorboard(1000000 if max_trials == -1 else max_trials,
                                        name_postfix='.optimize')
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            while max_trials:
                with log.supress_log_level(log.WARN):
                    raw_best_parameters = fmin(fn=self._optimize,
                                               space=space,
                                               algo=tpe.suggest,
                                               max_evals=len(self._optimization_info['trials'].trials) + 1,
                                               trials=self._optimization_info['trials'],
                                               show_progressbar=False)
                tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
                self.update_tensorboard_data(self._optimize_loss)
                tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
                iters += 1
                max_trials -= 1
                if self._optimization_info.get('best', {}).get('loss', 987654321) > self._optimize_loss['loss']:
                    is_first_time = self._optimization_info['best'] == {}
                    best_parameters = space_eval(space, raw_best_parameters)
                    best_loss = self._optimize_loss  # we cannot use return value of hyperopt due to randint behavior patch
                    self.logger.info(f'Found new best parameters: {best_parameters} @ iter {iters}')
                    self._optimization_info['best'] = best_loss
                    self._optimization_info['best_parameters'] = best_parameters
                    if opt.deployment and (is_first_time or not opt.min_trials or opt.min_trials >= iters):
                        if not self.opt.model_path:
                            raise RuntimeError('Failed to dump model: model path is not defined')
                        self.logger.info('Saving model... to {}'.format(self.opt.model_path))
                        self.save(self.opt.model_path)
                if self.optimize_after_callback_fn:
                    self.optimize_after_callback_fn(self)
                pbar.update(1)
                self.logger.debug('Params({}) Losses({})'.format(self._optimize_params, self._optimize_loss))
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            self.finalize_tensorboard()

            return self.get_optimization_data()
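
The optimization loop calls hyperopt's fmin with max_evals set to one more than the number of trials already recorded, so each call performs exactly one new evaluation and the surrounding loop can log, checkpoint, and update TensorBoard between trials. A minimal sketch of that trial-at-a-time pattern with a toy objective (the search space and objective are hypothetical):

from hyperopt import Trials, fmin, hp, space_eval, tpe

space = {'lr': hp.loguniform('lr', -7, 0),
         'dim': hp.quniform('dim', 8, 128, 8)}

def objective(params):
    # hypothetical stand-in: train a model with params and return validation loss
    return (params['lr'] - 0.01) ** 2 + params['dim'] * 1e-4

trials = Trials()
best_loss, best_params = float('inf'), None
for it in range(20):
    raw_best = fmin(fn=objective, space=space, algo=tpe.suggest,
                    max_evals=len(trials.trials) + 1,  # exactly one new trial per call
                    trials=trials, show_progressbar=False)
    last_loss = trials.results[-1]['loss']
    if last_loss < best_loss:
        best_loss = last_loss
        best_params = space_eval(space, raw_best)      # checkpoint/save the model here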