def _build_sppmi(self, db, working_data_path, sppmi_total_lines, k):
    self.logger.info(f"build sppmi (shift k: {k})")
    sz = db.attrs["num_items"]
    nnz = 0
    self.logger.debug("sort working_data")
    # external sort of the (word, context) pairs by the first column (word id)
    aux.psort(working_data_path, key=1)
    w_path = aux.get_temporary_file(root=self.opt.data.tmp_dir)
    self.logger.debug(f"build sppmi in_parallel. w: {w_path}")
    num_workers = psutil.cpu_count()
    nnz = parallel_build_sppmi(working_data_path, w_path, sppmi_total_lines, sz, k, num_workers)
    self.logger.debug(f"sort output. w: {w_path}, nnz: {nnz}")
    aux.psort(w_path)
    self.logger.info(f"convert from {working_data_path} to {w_path}")
    db.create_group("sppmi")
    db.attrs["sppmi_nnz"] = nnz
    self.logger.info(f"sppmi nnz: {nnz}")
    # allocate CSR-style storage for the SPPMI matrix
    db["sppmi"].create_dataset("indptr", (sz,), dtype='int64', maxshape=(sz,))
    db["sppmi"].create_dataset("key", (nnz,), dtype='int32', maxshape=(nnz,))
    db["sppmi"].create_dataset("val", (nnz,), dtype='float32', maxshape=(nnz,))
    self.logger.info('Disk-based Compressing...')
    job_files = self._chunking_into_bins(w_path, nnz, sz, 0)
    self._build_compressed_triplets(db["sppmi"], job_files,
                                    num_lines=nnz,
                                    max_key=sz,
                                    is_colwise=0)
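# Illustrative sketch (not part of the class above): what the compressed-triplet
# layout stored in db["sppmi"] roughly looks like, assuming indptr holds per-row
# end offsets (suggested by its (sz,) shape) while key/val hold the column ids
# and SPPMI values of the sorted triplets. The helper name and toy data below
# are made up for demonstration only.
import numpy as np

def triplets_to_csr_like(triplets, num_rows):
    triplets = sorted(triplets)                     # sort by (row, col)
    rows = np.array([r for r, _, _ in triplets])
    # end offset of each row's slice within key/val
    indptr = np.searchsorted(rows, np.arange(num_rows), side='right').astype(np.int64)
    key = np.array([c for _, c, _ in triplets], dtype=np.int32)
    val = np.array([v for _, _, v in triplets], dtype=np.float32)
    return indptr, key, val

# e.g. triplets_to_csr_like([(0, 2, 1.5), (2, 0, 1.5), (0, 1, 0.7)], num_rows=3)
# -> indptr=[2, 2, 3], key=[1, 2, 0], val=[0.7, 1.5, 1.5]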
def _build_sppmi(self, db, working_data_path, k):
    self.logger.info(f"build sppmi (shift k: {k})")
    appearances = {}
    nnz = 0
    aux.psort(working_data_path, key=1)
    with open(working_data_path, "r") as fin, \
            open(aux.get_temporary_file(root=self.opt.data.tmp_dir), "w") as w:
        D = sum(1 for line in fin)  # total number of (word, context) pairs
        fin.seek(0)
        probe, chunk = "junk", []
        for line in fin:
            _w, _c = line.strip().split()
            if probe != _w:
                # word id changed: flush the previous word's accumulated contexts
                appearances[probe] = len(chunk)
                for __c, cnt in Counter(chunk).items():
                    if int(probe) < int(__c):
                        continue
                    pmi = np.log(cnt) + np.log(D) - \
                        np.log(appearances[probe]) - np.log(appearances[__c])
                    sppmi = pmi - np.log(k)
                    if sppmi > 0:
                        w.write(f"{probe} {__c} {sppmi}\n")
                        w.write(f"{__c} {probe} {sppmi}\n")
                        nnz += 2
                probe, chunk = _w, []
            chunk.append(_c)
    aux.psort(w.name)
    self.logger.info(f"convert from {working_data_path} to {w.name}")
    db.create_group("sppmi")
    db.attrs["sppmi_nnz"] = nnz
    self.logger.info(f"sppmi nnz: {nnz}")
    sz = db.attrs["num_items"]
    db["sppmi"].create_dataset("indptr", (sz,), dtype='int64', maxshape=(sz,))
    db["sppmi"].create_dataset("key", (nnz,), dtype='int32', maxshape=(nnz,))
    db["sppmi"].create_dataset("val", (nnz,), dtype='float32', maxshape=(nnz,))
    self.logger.info('Disk-based Compressing...')
    job_files = self._chunking_into_bins(w.name, nnz, sz, 0)
    self._build_compressed_triplets(db["sppmi"], job_files,
                                    num_lines=nnz,
                                    max_key=sz,
                                    is_colwise=0)
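# Illustrative sketch (not part of the class above): how a single SPPMI value
# above is derived. With D total (word, context) pairs, n_w and n_c appearances
# of the word and the context, and a co-occurrence count cnt:
#     PMI(w, c)   = log(cnt * D / (n_w * n_c))
#     SPPMI(w, c) = max(PMI(w, c) - log(k), 0)
# The toy numbers below are made up for demonstration only.
import numpy as np

def toy_sppmi(cnt, D, n_w, n_c, k):
    pmi = np.log(cnt) + np.log(D) - np.log(n_w) - np.log(n_c)
    return max(pmi - np.log(k), 0.0)

# e.g. toy_sppmi(cnt=60, D=10000, n_w=200, n_c=150, k=10) -> about 0.69 (log 2)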
def _get_temporary_id_list_path(self, obj, name):
    field_name = f'temp_{name}'
    if hasattr(self, field_name):
        return getattr(self, field_name)
    tmp_path = aux.get_temporary_file(self.opt.data.tmp_dir)
    with open(tmp_path, 'w') as fout:
        if isinstance(obj, np.ndarray) and obj.ndim == 1:
            fout.write('\n'.join(map(str, obj.tolist())))
        elif isinstance(obj, (list,)):
            fout.write('\n'.join(map(str, obj)))
        else:
            raise RuntimeError(
                f'Unexpected data type for id list: {type(obj)}')
    setattr(self, field_name, tmp_path)
    return tmp_path
def get_main_path(self):
    main = self.opt.input.main
    if isinstance(main, (str,)):
        return main
    if hasattr(self, 'temp_main'):
        return self.temp_main
    log.get_logger('MatrixMarketDataReader').debug('creating temporary matrix-market data from numpy-kind array')
    tmp_path = aux.get_temporary_file(self.opt.data.tmp_dir)
    with open(tmp_path, 'wb') as fout:
        if isinstance(main, (np.ndarray,)) and main.ndim == 2:
            main = scipy.sparse.csr_matrix(main)
        if scipy.sparse.issparse(main):
            scipy.io.mmwrite(fout, main)
            self.temp_main = tmp_path
            return tmp_path
    raise RuntimeError(f'Unexpected data type for MatrixMarketOption.input.main field: {type(main)}')
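# Illustrative sketch (standalone, not part of the reader above): the conversion
# path relies on scipy's MatrixMarket writer. A 2-D numpy array is wrapped in a
# CSR matrix first, then serialized. The file name below is hypothetical; the
# reader writes to a temporary file instead.
import numpy as np
import scipy.io
import scipy.sparse

dense = np.array([[1, 0, 3],
                  [0, 2, 0]])
sparse = scipy.sparse.csr_matrix(dense)   # same densification step as above
with open('main.mtx', 'wb') as fout:      # hypothetical path
    scipy.io.mmwrite(fout, sparse)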
def _create_working_data(self, db, source_path, ignore_lines):
    """
    Args:
        source_path: source data file path
        ignore_lines: number of lines to skip from start line
    """
    vali_indexes = [] if 'vali' not in db else db['vali']['indexes']
    vali_lines = []
    file_path = aux.get_temporary_file(self.opt.data.tmp_dir)
    with open(file_path, 'w') as w:
        fin = open(source_path, mode='r')
        file_size = fin.seek(0, 2)
        fin.seek(0, 0)
        for _ in range(ignore_lines):
            fin.readline()
        total = file_size - fin.tell()
        buffered = ''
        CHUNK_SIZE = 4096 * 1000
        total_lines = 0
        vali_indexes = sorted(vali_indexes)
        target_index = vali_indexes[0] if vali_indexes else -1
        vali_indexes = vali_indexes[1:]
        with log.ProgressBar(log.INFO, total=total, mininterval=10) as pbar:
            while True:
                buffered += fin.read(CHUNK_SIZE)
                if buffered == '':
                    break
                current_file_position = fin.tell()
                pbar.update(CHUNK_SIZE)
                num_lines_on_buffer = buffered.count('\n')
                # search the position of validation sample and extract
                # it from training data
                while target_index >= 0 and target_index <= (total_lines + num_lines_on_buffer):
                    no_line = total_lines
                    new_buffered = ''
                    from_index = 0
                    for idx, c in enumerate(buffered):
                        if c == '\n':
                            if no_line == target_index:
                                # the validation line ends here: keep it aside and
                                # flush the preceding training lines to disk
                                vali_lines.append(buffered[from_index:idx])
                                if from_index > 0:
                                    w.write(buffered[0:from_index])
                                new_buffered = buffered[idx + 1:]
                                no_line += 1
                                total_lines += 1
                                num_lines_on_buffer -= 1
                                break
                            no_line += 1
                            total_lines += 1
                            from_index = idx + 1
                            num_lines_on_buffer -= 1
                    buffered = new_buffered
                    if vali_indexes:
                        target_index, vali_indexes = vali_indexes[0], vali_indexes[1:]
                    else:
                        target_index = -1
                where = buffered.rfind('\n')
                total_lines += num_lines_on_buffer
                if where != -1:
                    # write out every complete line left in the buffer as training data
                    w.write(buffered[:where + 1])
                    buffered = buffered[where + 1:]
                elif current_file_position == file_size:
                    w.write(buffered)
                    buffered = ''
        w.close()
        fin.close()
        return w.name, vali_lines
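# Illustrative sketch (not part of the class above): a line-by-line equivalent of
# the chunked split above. Lines whose index is in vali_indexes are held out for
# validation, everything else is written to the training file; the real method
# reads CHUNK_SIZE blocks instead to avoid per-line overhead on large files.
# Names and paths here are hypothetical.
def split_train_vali(source_path, train_path, vali_indexes, ignore_lines=0):
    vali_indexes = set(vali_indexes)
    vali_lines = []
    with open(source_path) as fin, open(train_path, 'w') as w:
        for _ in range(ignore_lines):
            fin.readline()
        for idx, line in enumerate(fin):
            if idx in vali_indexes:
                vali_lines.append(line.rstrip('\n'))
            else:
                w.write(line)
    return vali_lines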
def _create_working_data(self, db, stream_main_path, itemids,
                         with_sppmi=False, windows=5):
    vali_method = None if 'vali' not in db else db['vali'].attrs['method']
    vali_indexes, vali_n = set(), 0
    if vali_method == 'sample':
        vali_indexes = set(db['vali']['indexes'])
    elif vali_method in ['newest']:
        vali_n = db['vali'].attrs['n']
    vali_lines = []
    users = db['idmap']['rows'][:]
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ResourceWarning)
        if with_sppmi:
            w_sppmi = open(
                aux.get_temporary_file(root=self.opt.data.tmp_dir), "w")
        file_path = aux.get_temporary_file(root=self.opt.data.tmp_dir)
        with open(stream_main_path) as fin,\
                open(file_path, 'w') as w:
            total_index = 0
            internal_data_type = self.opt.data.internal_data_type
            for line_idx, data in log.iter_pbar(log_level=log.DEBUG, iterable=enumerate(fin)):
                data = data.strip().split()
                total_data_size = len(data)
                user = line_idx + 1
                vali_data, train_data = [], []
                if vali_method in ['newest']:
                    # hold out the newest items of each user for validation
                    vali_data_size = min(vali_n, len(data) - 1)
                    train_data_size = len(data) - vali_data_size
                    vali = data[train_data_size:]
                    data = data[:train_data_size]
                    for col, val in Counter(vali).items():
                        col = itemids[col]
                        vali_data.append(col)
                if internal_data_type == 'stream':
                    for idx, col in enumerate(data):
                        col = itemids[col]
                        if (idx + total_index) in vali_indexes:
                            vali_data.append(col)
                        else:
                            train_data.append(col)
                elif internal_data_type == 'matrix':
                    for idx, col in enumerate(data):
                        col = itemids[col]
                        if (idx + total_index) in vali_indexes:
                            vali_data.append(col)
                        else:
                            train_data.append(col)
                total_index += len(data)
                if internal_data_type == 'stream':
                    for col in train_data:
                        w.write(f'{user} {col} 1\n')
                    for col in vali_data:
                        vali_lines.append(f'{user} {col} {val}')
                else:
                    for col, val in Counter(train_data).items():
                        w.write(f'{user} {col} {val}\n')
                    for col, val in Counter(vali_data).items():
                        vali_lines.append(f'{user} {col} {val}')
                if with_sppmi:
                    # emit symmetric (word, context) pairs within the sliding window
                    sz = len(train_data)
                    for i in range(sz):
                        beg, end = i + 1, i + windows + 1
                        for j in range(beg, end):
                            if j >= sz:
                                break
                            _w, _c = train_data[i], train_data[j]
                            w_sppmi.write(f'{_w} {_c}\n')
                            w_sppmi.write(f'{_c} {_w}\n')
        if with_sppmi:
            w_sppmi.close()
            return w.name, vali_lines, w_sppmi.name
        return w.name, vali_lines, None
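# Illustrative sketch (not part of the class above): the window-based pair
# generation used for the SPPMI working file. For each position i, every item
# within the next `windows` positions is emitted as a symmetric (word, context)
# pair. The toy sequence is made up for demonstration.
def sppmi_pairs(items, windows=5):
    pairs = []
    sz = len(items)
    for i in range(sz):
        for j in range(i + 1, min(i + windows + 1, sz)):
            pairs.append((items[i], items[j]))
            pairs.append((items[j], items[i]))
    return pairs

# e.g. sppmi_pairs(['a', 'b', 'c'], windows=2)
# -> [('a', 'b'), ('b', 'a'), ('a', 'c'), ('c', 'a'), ('b', 'c'), ('c', 'b')]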
def __init__(self, *args, **kwargs):
    self._optimization_info = {'trials': Trials(), 'best': {}}
    self._temporary_opt_file = aux.get_temporary_file()
    self.optimize_after_callback_fn = kwargs.get('optimize_after_callback_fn')