def _iterate(self, buf, group='rowwise'):
    """Run one pass of partial updates over all batches in ``buf``.

    Args:
        buf: batch buffer exposing ``set_group``/``fetch_batch``/``get``.
        group: 'rowwise' or 'colwise'; selects which side of the
            factorization is updated (mapped to 0/1 for the C++ object).

    Returns:
        (loss_nume, loss_deno): accumulated loss numerator/denominator
        over all processed batches.
    """
    header = self.data.get_header()
    # end = header['num_users'] if group == 'rowwise' else header['num_items']
    int_group = 0 if group == 'rowwise' else 1
    st = time.time()
    self.obj.precompute(int_group)
    el, st = time.time() - st, time.time()
    loss_nume, loss_deno = 0.0, 0.0
    # precompute() time is attributed to update time, not feed time
    update_t, feed_t, updated = el, 0, 0
    buf.set_group(group)
    with log.ProgressBar(log.DEBUG,
                         desc='%s' % group,
                         total=header['num_nnz'],
                         mininterval=30) as pbar:
        for sz in buf.fetch_batch():
            updated += sz
            start_x, next_x, indptr, keys, vals = buf.get()
            _feed_t, st = time.time() - st, time.time()
            _loss_nume, _loss_deno = self.obj.partial_update(
                start_x, next_x, indptr, keys, vals, int_group)
            loss_nume += _loss_nume
            loss_deno += _loss_deno
            _update_t, st = time.time() - st, time.time()
            pbar.update(sz)
            feed_t += _feed_t
            update_t += _update_t
    # Fixed: update_t previously used '{:0.03}' (3 significant digits,
    # general format); now '{:0.3f}' to match feed_t (3 decimal places).
    self.logger.debug(
        f'{group} updated: processed({updated}) '
        f'elapsed(data feed: {feed_t:0.3f}s update: {update_t:0.3f}s)')
    return loss_nume, loss_deno
def _create(self, data_path, P, H):
    """Create the on-disk database for MatrixMarket-style input.

    Args:
        data_path: destination path of the database.
        P: dict carrying 'uid_path', 'iid_path' and 'main_path'.
        H: header string holding "num_users num_items num_nnz".

    Returns:
        (db, num_header_lines): the created database handle and the
        number of leading '%' comment lines in the main file.
    """
    def _longest_line(fname):
        # Width of the widest raw line (newline included) bounds the
        # string column size needed to store ids from this file.
        with open(fname) as fin:
            return max((len(row) for row in fin), default=0)

    uid_path = P['uid_path']
    iid_path = P['iid_path']
    main_path = P['main_path']
    num_users, num_items, num_nnz = (int(tok) for tok in H.split())
    # Manually updating progress bar is a bit naive
    with log.ProgressBar(log.DEBUG, total=5, mininterval=30) as pbar:
        # Without an id file, ids default to "1".."N", so the decimal
        # width of N (+1) is enough.
        uid_max_col = (_longest_line(uid_path) + 1) if uid_path \
            else len(str(num_users)) + 1
        pbar.update(1)
        iid_max_col = (_longest_line(iid_path) + 1) if iid_path \
            else len(str(num_items)) + 1
        pbar.update(1)
        try:
            db = self._create_database(data_path,
                                       num_users=num_users,
                                       num_items=num_items,
                                       num_nnz=num_nnz,
                                       uid_max_col=uid_max_col,
                                       iid_max_col=iid_max_col)
            idmap = db['idmap']
            # If no id file is given, assume ids are "1".."N" as-is.
            if uid_path:
                with open(uid_path) as fin:
                    idmap['rows'][:] = np.loadtxt(fin, dtype=f'S{uid_max_col}')
            else:
                idmap['rows'][:] = np.array(
                    [str(i) for i in range(1, num_users + 1)],
                    dtype=f'S{uid_max_col}')
            pbar.update(1)
            if iid_path:
                with open(iid_path) as fin:
                    idmap['cols'][:] = np.loadtxt(fin, dtype=f'S{iid_max_col}')
            else:
                idmap['cols'][:] = np.array(
                    [str(i) for i in range(1, num_items + 1)],
                    dtype=f'S{iid_max_col}')
            pbar.update(1)
            # Count leading '%' comment lines of the MatrixMarket body so
            # the caller can skip them when reading triplets.
            num_header_lines = 0
            with open(main_path) as fin:
                for line in fin:
                    if not line.strip().startswith('%'):
                        break
                    num_header_lines += 1
            pbar.update(1)
        except Exception as e:
            self.logger.error('Cannot create db: %s' % (str(e)))
            self.logger.error(traceback.format_exc())
            raise
    return db, num_header_lines
def _build_compressed_triplets(self, db, job_files, num_lines, max_key, is_colwise=0):
    """Merge binary triplet job files into CSR-like arrays in ``db``.

    Each job file is a flat stream of 12-byte records: (int32 u, int32 i,
    float32 v). Records are assumed sorted by the major key across files.
    Fills db['key'], db['val'] and the row-pointer array db['indptr'],
    then deletes the consumed job files.

    Args:
        db: mapping with writable 'key', 'val', 'indptr' arrays.
        job_files: paths of the binary triplet files to merge.
        num_lines: capacity of db['key']/db['val'].
        max_key: capacity of db['indptr'].
        is_colwise: if truthy, swap u/i so the column becomes the major key.
    """
    self.logger.info('Total job files: %s' % len(job_files))
    with log.ProgressBar(log.INFO, total=len(job_files), mininterval=10) as pbar:
        indptr_index = 0
        data_index = 0
        # 12 bytes per record: two int32 + one float32 (matches the dtype below).
        RECORD_SIZE = 12
        prev_key = 0
        for job in job_files:
            with open(job, 'rb') as fin:
                # seek(0, 2) returns the file size; empty files are skipped.
                total_size = fin.seek(0, 2)
                if total_size == 0:
                    continue
                total_records = int(total_size / RECORD_SIZE)
                fin.seek(0, 0)
                data = np.frombuffer(fin.read(),
                                     dtype=np.dtype([('u', 'i'), ('i', 'i'), ('v', 'f')]),
                                     count=total_records)
                U, I, V = data['u'], data['i'], data['v']
                if is_colwise:
                    U, I = I, U
                if self.opt.data.value_prepro:
                    # copy() because np.frombuffer views are read-only.
                    V = self.value_prepro(V.copy())
                self.logger.debug("minU: {}, maxU: {}".format(U[0], U[-1]))
                assert data_index + total_records <= num_lines, 'Requests data size(%s) exceed capacity(%s)' % (
                    data_index + total_records, num_lines)
                db['key'][data_index:data_index + total_records] = I
                db['val'][data_index:data_index + total_records] = V
                # Build the indptr slice for this chunk from the gaps in the
                # (sorted) major keys: rows skipped between files repeat the
                # current offset, and a key gap of g inside the chunk emits
                # the same boundary g times, so empty rows get zero-length
                # spans (standard CSR semantics).
                diff = U[1:] - U[:-1]
                max_diff = np.amax(diff) if len(diff) else 0
                indptr = [data_index for _ in range(U[0] - prev_key)]
                for i in range(max_diff):
                    indptr += (np.where(diff > i)[0] + data_index + 1).tolist()
                indptr.sort()
                db['indptr'][indptr_index:indptr_index + len(indptr)] = indptr
                assert indptr_index + len(indptr) <= max_key
                data_index += total_records
                indptr_index += len(indptr)
                prev_key = U[-1]
            pbar.update(1)
        # Close off all remaining (trailing empty) rows at the final offset.
        db["indptr"][indptr_index:] = data_index
    # Job files are temporary; remove them once merged.
    for path in job_files:
        os.remove(path)
def _iterate(self):
    """Feed every rowwise batch from ``self.buf`` to the worker as jobs.

    Tracks wall-clock time spent feeding data vs. submitting jobs and
    logs both once all batches are dispatched.
    """
    header = self.data.get_header()
    # end = header['num_users']
    update_t, feed_t, updated = 0, 0, 0
    self.buf.set_group('rowwise')
    with log.ProgressBar(log.DEBUG, total=header['num_nnz'], mininterval=15) as pbar:
        start_t = time.time()
        for sz in self.buf.fetch_batch():
            updated += sz
            feed_t += time.time() - start_t
            start_x, next_x, indptr, keys = self.buf.get()
            start_t = time.time()
            self.obj.add_jobs(start_x, next_x, indptr, keys)
            update_t += time.time() - start_t
            pbar.update(sz)
    # Fixed: message previously read 'update: ...)s' with the unit outside
    # the closing paren; now 'update: ...s)'.
    self.logger.debug(
        f'processed({updated}) elapsed(data feed: {feed_t:0.3f}s update: {update_t:0.3f}s)')
def _create_working_data(self, db, source_path, ignore_lines):
    """Copy training lines to a temp file, extracting validation lines.

    Streams ``source_path`` in large chunks, writing every line to a
    temporary training file except the lines whose 0-based index appears
    in db['vali']['indexes']; those are collected and returned instead.

    Args:
        source_path: source data file path
        ignore_lines: number of lines to skip from start line

    Returns:
        (training_file_path, vali_lines): path of the temp training file
        and the list of extracted validation lines (without newlines).
    """
    vali_indexes = [] if 'vali' not in db else db['vali']['indexes']
    vali_lines = []
    file_path = aux.get_temporary_file(self.opt.data.tmp_dir)
    with open(file_path, 'w') as w:
        # NOTE(review): fin is not opened via `with`; it leaks if an
        # exception is raised before the explicit close below.
        fin = open(source_path, mode='r')
        # seek to end to learn the file size, then rewind.
        file_size = fin.seek(0, 2)
        fin.seek(0, 0)
        for _ in range(ignore_lines):
            fin.readline()
        total = file_size - fin.tell()
        buffered = ''
        CHUNK_SIZE = 4096 * 1000
        total_lines = 0
        # Consume validation indexes in ascending order, one target at a time;
        # -1 means "no more validation lines to extract".
        vali_indexes = sorted(vali_indexes)
        target_index = vali_indexes[0] if vali_indexes else -1
        vali_indexes = vali_indexes[1:]
        with log.ProgressBar(log.INFO, total=total, mininterval=10) as pbar:
            while True:
                buffered += fin.read(CHUNK_SIZE)
                if buffered == '':
                    break
                current_file_position = fin.tell()
                pbar.update(CHUNK_SIZE)
                num_lines_on_buffer = buffered.count('\n')
                # search the position of validation sample and extract
                # it from training data
                while target_index >= 0 and target_index <= (total_lines + num_lines_on_buffer):
                    no_line = total_lines
                    new_buffered = ''
                    from_index = 0
                    # Scan char-by-char for the newline ending the target
                    # line; everything before it is flushed as training
                    # data, the target line itself goes to vali_lines.
                    for idx, c in enumerate(buffered):
                        if c == '\n':
                            if no_line == target_index:
                                vali_lines.append(buffered[from_index:idx])
                                if from_index > 0:
                                    w.write(buffered[0:from_index])
                                new_buffered = buffered[idx + 1:]
                                no_line += 1
                                total_lines += 1
                                num_lines_on_buffer -= 1
                                break
                            no_line += 1
                            total_lines += 1
                            from_index = idx + 1
                            num_lines_on_buffer -= 1
                    buffered = new_buffered
                    if vali_indexes:
                        target_index, vali_indexes = vali_indexes[0], vali_indexes[1:]
                    else:
                        target_index = -1
                # Flush only complete lines; keep the trailing partial line
                # buffered for the next chunk (or flush it at EOF).
                where = buffered.rfind('\n')
                total_lines += num_lines_on_buffer
                if where != -1:
                    w.write(buffered[:where + 1])
                    buffered = buffered[where + 1:]
                elif current_file_position == file_size:
                    w.write(buffered)
                    buffered = ''
    # w.close() is redundant under the `with`, but harmless; fin must be
    # closed explicitly.
    w.close()
    fin.close()
    return w.name, vali_lines
def _create(self, data_path, P):
    """Create the on-disk database for stream-style input.

    First pass counts users, non-zeros and the validation budget; second
    pass (over the id files) fills the id maps.

    Args:
        data_path: destination path of the database.
        P: dict carrying 'uid_path', 'iid_path' and 'main_path'.

    Returns:
        (db, itemids): the created database handle and the mapping of
        raw item id -> 1-based internal index.
    """
    def get_max_column_length(fname):
        with open(fname) as fin:
            max_col = 0
            for l in fin:
                # len(l) includes the trailing newline.
                max_col = max(max_col, len(l))
            return max_col
    uid_path, iid_path, main_path = P['uid_path'], P['iid_path'], P[
        'main_path']
    # One user per line: count lines of the uid file, or of the main
    # file when no uid file is given.
    if uid_path:
        with open(uid_path) as fin:
            num_users = len([1 for _ in fin])
    else:
        with open(main_path) as fin:
            num_users = len([1 for _ in fin])
    uid_max_col = len(str(num_users)) + 1
    if uid_path:
        uid_max_col = get_max_column_length(uid_path) + 1
    # 'n' newest events per user may be held out for validation.
    vali_n = self.opt.data.validation.get('n', 0)
    num_nnz, vali_limit, itemids = 0, 0, set()
    self.logger.info(f'gathering itemids from {main_path}...')
    # Only the 'newest' validation scheme reserves samples at this stage.
    if self.opt.data.validation.name not in ["newest"]:
        vali_n = 0
    with open(main_path) as fin:
        for line in log.ProgressBar(level=log.DEBUG, iterable=fin):
            data = line.strip().split()
            if not iid_path:
                itemids |= set(data)
            data_size = len(data)
            # Keep at least one training event per user.
            _vali_size = min(vali_n, len(data) - 1)
            vali_limit += _vali_size
            if self.opt.data.internal_data_type == 'stream':
                num_nnz += (data_size - _vali_size)
            elif self.opt.data.internal_data_type == 'matrix':
                # matrix mode stores unique (user, item) pairs only.
                num_nnz += len(set(data[:(data_size - _vali_size)]))
    if iid_path:
        with open(iid_path) as fin:
            itemids = {iid.strip(): idx + 1 for idx, iid in enumerate(fin)}
    else:  # in case the item information file is not given
        # NOTE(review): enumerating a set gives an arbitrary (but fixed
        # for this run) item ordering — confirm callers don't rely on a
        # stable order across runs.
        itemids = {i: idx + 1 for idx, i in enumerate(itemids)}
    # NOTE(review): raises ValueError on an empty itemids mapping.
    iid_max_col = max(len(k) + 1 for k in itemids.keys())
    num_items = len(itemids)
    self.logger.info('Found %d unique itemids' % len(itemids))
    try:
        db = self._create_database(data_path,
                                   num_users=num_users,
                                   num_items=num_items,
                                   num_nnz=num_nnz,
                                   uid_max_col=uid_max_col,
                                   iid_max_col=iid_max_col,
                                   num_validation_samples=vali_limit)
        idmap = db['idmap']
        # if not given, assume id as is
        if uid_path:
            with open(uid_path) as fin:
                idmap['rows'][:] = np.loadtxt(fin, dtype=f'S{uid_max_col}')
        else:
            idmap['rows'][:] = np.array(
                [str(i) for i in range(1, num_users + 1)],
                dtype=f'S{uid_max_col}')
        if iid_path:
            with open(iid_path) as fin:
                idmap['cols'][:] = np.loadtxt(fin,
                                              dtype=f'S{iid_max_col}')
        else:
            # Store raw item ids ordered by their internal index.
            cols = sorted(itemids.items(), key=lambda x: x[1])
            cols = [k for k, _ in cols]
            idmap['cols'][:] = np.array(cols, dtype=f'S{iid_max_col}')
    except Exception as e:
        self.logger.error('Cannot create db: %s' % (str(e)))
        self.logger.error(traceback.format_exc())
        raise
    return db, itemids
def _create_working_data(self, db, stream_main_path, itemids,
                         with_sppmi=False, windows=5):
    """Convert a stream file into triplet training/validation data.

    Each input line is one user's space-separated item sequence. Items
    are remapped through ``itemids``; depending on the validation method
    ('sample' = by global event index, 'newest' = last n per user) events
    are diverted to ``vali_lines``. Optionally also emits co-occurrence
    pairs within a sliding window for SPPMI.

    Args:
        db: database handle; db['vali'] (if present) configures validation.
        stream_main_path: path of the stream main file.
        itemids: mapping of raw item id -> internal index.
        with_sppmi: also write SPPMI co-occurrence pairs.
        windows: sliding window size for SPPMI pairs.

    Returns:
        (train_path, vali_lines, sppmi_path, sppmi_total_lines); the last
        two are None when ``with_sppmi`` is False.
    """
    vali_method = None if 'vali' not in db else db['vali'].attrs['method']
    vali_indexes, vali_n = set(), 0
    if vali_method == 'sample':
        vali_indexes = set(db['vali']['indexes'])
    elif vali_method in ['newest']:
        vali_n = db['vali'].attrs['n']
    vali_lines = []
    # users = db['idmap']['rows'][:] will be used someday?
    sppmi_total_lines = 0
    with warnings.catch_warnings():
        # Temp files are closed by name later; silence ResourceWarning.
        warnings.simplefilter("ignore", ResourceWarning)
        if with_sppmi:
            w_sppmi = open(
                aux.get_temporary_file(root=self.opt.data.tmp_dir), "w")
        file_path = aux.get_temporary_file(root=self.opt.data.tmp_dir)
        with open(stream_main_path) as fin, open(file_path, 'w') as w:
            total_index = 0
            internal_data_type = self.opt.data.internal_data_type
            for line_idx, data in log.ProgressBar(level=log.DEBUG,
                                                  iterable=enumerate(fin)):
                data = data.strip().split()
                # total_data_size = len(data) will be used someday?
                # Users are numbered 1-based by line position.
                user = line_idx + 1
                vali_data, train_data = [], []
                if vali_method in ['newest']:
                    # Hold out up to n newest events, keeping >= 1 for training.
                    vali_data_size = min(vali_n, len(data) - 1)
                    train_data_size = len(data) - vali_data_size
                    vali = data[train_data_size:]
                    data = data[:train_data_size]
                    for col, val in Counter(vali).items():
                        col = itemids[col]
                        vali_data.append(col)
                # Remap items; divert globally-sampled validation indexes.
                if internal_data_type == 'stream':
                    for idx, col in enumerate(data):
                        col = itemids[col]
                        if (idx + total_index) in vali_indexes:
                            vali_data.append(col)
                        else:
                            train_data.append(col)
                elif internal_data_type == 'matrix':
                    for idx, col in enumerate(data):
                        col = itemids[col]
                        if (idx + total_index) in vali_indexes:
                            vali_data.append(col)
                        else:
                            train_data.append(col)
                total_index += len(data)
                if internal_data_type == 'stream':
                    for col in train_data:
                        w.write(f'{user} {col} 1\n')
                    for col in vali_data:
                        # NOTE(review): `val` here is a leftover from the
                        # Counter(vali) loop above — it is stale, and
                        # unbound (NameError) when vali_method != 'newest';
                        # a literal count of 1 looks intended. Confirm.
                        vali_lines.append(f'{user} {col} {val}')
                else:
                    # matrix mode aggregates duplicate items into counts.
                    for col, val in Counter(train_data).items():
                        w.write(f'{user} {col} {val}\n')
                    for col, val in Counter(vali_data).items():
                        vali_lines.append(f'{user} {col} {val}')
                if with_sppmi:
                    # Emit symmetric co-occurrence pairs within the window.
                    sz = len(train_data)
                    for i in range(sz):
                        beg, end = i + 1, i + windows + 1
                        for j in range(beg, end):
                            if j >= sz:
                                break
                            _w, _c = train_data[i], train_data[j]
                            w_sppmi.write(f'{_w} {_c}\n')
                            w_sppmi.write(f'{_c} {_w}\n')
                            sppmi_total_lines += 2
    if with_sppmi:
        w_sppmi.close()
        return w.name, vali_lines, w_sppmi.name, sppmi_total_lines
    return w.name, vali_lines, None, None
def optimize(self):
    """Run hyperparameter optimization with hyperopt's TPE search.

    Repeatedly calls ``fmin`` one evaluation at a time, tracking the best
    loss/parameters in ``self._optimization_info`` and optionally saving
    the model on improvement. ``opt.max_trials == -1`` means unlimited
    trials (the while condition then never reaches zero).

    Returns:
        The collected optimization data from ``self.get_optimization_data()``.

    Raises:
        RuntimeError: if deployment is requested without a model path.
    """
    assert self.opt.evaluation_on_learning, \
        'evaluation must be set to be true to do hyperparameter optimization.'
    if self.opt.optimize.loss.startswith("val"):
        assert self.opt.validation, \
            'validation option must be set to be true to do hyperparameter optimization with validation results.'
    opt = self.opt.optimize

    iters, max_trials = 0, opt.get('max_trials', -1)
    space = self._get_space(opt.space)
    with log.ProgressBar(log.INFO,
                         desc='optimizing... ',
                         total=None if max_trials == -1 else max_trials,
                         mininterval=30) as pbar:
        # The repeated swap below toggles the tensorboard option off/on
        # around inner _optimize calls so per-trial runs do not log to
        # the main tensorboard; each pair of swaps restores the state.
        tb_opt = None
        tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
        if opt.start_with_default_parameters:
            with log.supress_log_level(log.WARN):
                loss = self._optimize({})
            self.logger.info(f'Starting with default parameter result: {loss}')
            self._optimization_info['best'] = loss
            if opt.deployment:
                self.logger.info('Saving model... to {}'.format(self.opt.model_path))
                self.save(self.opt.model_path)
        # NOTE: need better way
        tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
        self.initialize_tensorboard(1000000 if max_trials == -1 else max_trials,
                                    name_postfix='.optimize')
        tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
        while(max_trials):
            with log.supress_log_level(log.WARN):
                # max_evals is cumulative, so +1 runs exactly one new trial
                # per loop iteration, reusing the shared Trials object.
                raw_best_parameters = fmin(fn=self._optimize,
                                           space=space,
                                           algo=tpe.suggest,
                                           max_evals=len(self._optimization_info['trials'].trials) + 1,
                                           trials=self._optimization_info['trials'],
                                           show_progressbar=False)
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            self.update_tensorboard_data(self._optimize_loss)
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            iters += 1
            max_trials -= 1
            # 987654321 acts as +infinity for the first comparison.
            if self._optimization_info.get('best', {}).get('loss', 987654321) > self._optimize_loss['loss']:
                is_first_time = self._optimization_info['best'] == {}
                best_parameters = space_eval(space, raw_best_parameters)
                # we cannot use return value of hyperopt due to randint behavior patch
                best_loss = self._optimize_loss
                self.logger.info(f'Found new best parameters: {best_parameters} @ iter {iters}')
                self._optimization_info['best'] = best_loss
                self._optimization_info['best_parameters'] = best_parameters
                if opt.deployment and (is_first_time or not opt.min_trials or opt.min_trials >= iters):
                    if not self.opt.model_path:
                        raise RuntimeError('Failed to dump model: model path is not defined')
                    self.logger.info('Saving model... to {}'.format(self.opt.model_path))
                    self.save(self.opt.model_path)
            if self.optimize_after_callback_fn:
                self.optimize_after_callback_fn(self)
            pbar.update(1)
            self.logger.debug('Params({}) Losses({})'.format(self._optimize_params, self._optimize_loss))
        tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
        self.finalize_tensorboard()
    return self.get_optimization_data()