Example #1
 def _build_sppmi(self, db, working_data_path, sppmi_total_lines, k):
     self.logger.info(f"build sppmi (shift k: {k})")
     sz = db.attrs["num_items"]
     self.logger.debug("sort working_data")
     aux.psort(working_data_path, key=1)
     w_path = aux.get_temporary_file(root=self.opt.data.tmp_dir)
     self.logger.debug(f"build sppmi in_parallel. w: {w_path}")
     num_workers = psutil.cpu_count()
     nnz = parallel_build_sppmi(working_data_path, w_path,
                                sppmi_total_lines, sz, k, num_workers)
     self.logger.debug(f"sort output. w: {w_path}, nnz: {nnz}")
     aux.psort(w_path)
     self.logger.info(f"convert from {working_data_path} to {w_path}")
     db.create_group("sppmi")
     db.attrs["sppmi_nnz"] = nnz
     self.logger.info(f"sppmi nnz: {nnz}")
     db["sppmi"].create_dataset("indptr", (sz, ),
                                dtype='int64',
                                maxshape=(sz, ))
     db["sppmi"].create_dataset("key", (nnz, ),
                                dtype='int32',
                                maxshape=(nnz, ))
     db["sppmi"].create_dataset("val", (nnz, ),
                                dtype='float32',
                                maxshape=(nnz, ))
     self.logger.info('Disk-based Compressing...')
     job_files = self._chunking_into_bins(w_path, nnz, sz, 0)
     self._build_compressed_triplets(db["sppmi"],
                                     job_files,
                                     num_lines=nnz,
                                     max_key=sz,
                                     is_colwise=0)
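The per-pair math here is delegated to parallel_build_sppmi. For reference, a minimal sketch of the shifted positive PMI value that Example #2 computes inline; cnt, D, n_w, n_c and k are illustrative names for the pair count, the total number of pairs, the two appearance counts, and the shift:

    import numpy as np

    def shifted_positive_pmi(cnt, D, n_w, n_c, k):
        # PMI(w, c) = log(#(w, c) * D / (#(w) * #(c))), shifted down by log(k);
        # non-positive values are dropped, which keeps the matrix sparse.
        pmi = np.log(cnt) + np.log(D) - np.log(n_w) - np.log(n_c)
        return max(pmi - np.log(k), 0.0)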
Example #2
 def _build_sppmi(self, db, working_data_path, k):
     self.logger.info(f"build sppmi (shift k: {k})")
     appearances = {}
     nnz = 0
     aux.psort(working_data_path, key=1)
     with open(working_data_path, "r") as fin, \
             open(aux.get_temporary_file(root=self.opt.data.tmp_dir), "w") as w:
         D = sum(1 for line in fin)  # total number of co-occurrence pairs
         fin.seek(0)

         def flush(probe, chunk):
             # emit the SPPMI entries for every context collected for `probe`
             nonlocal nnz
             appearances[probe] = len(chunk)
             for _c, cnt in Counter(chunk).items():
                 if int(probe) < int(_c):
                     continue  # handle each unordered pair exactly once
                 pmi = np.log(cnt) + np.log(D) - \
                     np.log(appearances[probe]) - np.log(appearances[_c])
                 sppmi = pmi - np.log(k)
                 if sppmi > 0:
                     w.write(f"{probe} {_c} {sppmi}\n")
                     w.write(f"{_c} {probe} {sppmi}\n")
                     nnz += 2

         probe, chunk = None, []
         for line in fin:
             _w, _c = line.strip().split()
             if probe != _w:
                 if probe is not None:
                     flush(probe, chunk)
                 probe, chunk = _w, []
             chunk.append(_c)
         if probe is not None:
             flush(probe, chunk)  # the loop above never flushes the last group
     aux.psort(w.name)
     self.logger.info(f"convert from {working_data_path} to {w.name}")
     db.create_group("sppmi")
     db.attrs["sppmi_nnz"] = nnz
     self.logger.info(f"sppmi nnz: {nnz}")
     sz = db.attrs["num_items"]
     db["sppmi"].create_dataset("indptr", (sz, ),
                                dtype='int64',
                                maxshape=(sz, ))
     db["sppmi"].create_dataset("key", (nnz, ),
                                dtype='int32',
                                maxshape=(nnz, ))
     db["sppmi"].create_dataset("val", (nnz, ),
                                dtype='float32',
                                maxshape=(nnz, ))
     self.logger.info('Disk-based Compressing...')
     job_files = self._chunking_into_bins(w.name, nnz, sz, 0)
     self._build_compressed_triplets(db["sppmi"],
                                     job_files,
                                     num_lines=nnz,
                                     max_key=sz,
                                     is_colwise=0)
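The sentinel-and-flush pattern above is easy to get wrong because the final group must be flushed after the loop ends. A standalone sketch (not buffalo API) of the same grouping over a sorted pair file using itertools.groupby, which handles the last group automatically:

    import itertools

    def iter_groups(fin):
        # yield (word, [context, ...]) groups from sorted "w c" lines
        pairs = (line.split() for line in fin)
        for word, grp in itertools.groupby(pairs, key=lambda p: p[0]):
            yield word, [c for _, c in grp]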
Example #3
 def _get_temporary_id_list_path(self, obj, name):
     field_name = f'temp_{name}'
     if hasattr(self, field_name):
         return getattr(self, field_name)
     tmp_path = aux.get_temporary_file(self.opt.data.tmp_dir)
     with open(tmp_path, 'w') as fout:
         if isinstance(obj, np.ndarray) and obj.ndim == 1:
             fout.write('\n'.join(map(str, obj.tolist())))
         elif isinstance(obj, list):
             fout.write('\n'.join(map(str, obj)))
         else:
             raise RuntimeError(
                 f'Unexpected data type for id list: {type(obj)}')
     setattr(self, field_name, tmp_path)
     return tmp_path
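A minimal standalone sketch of the same memoize-on-the-instance pattern, using the standard tempfile module in place of buffalo's aux helper (the class and method names are hypothetical):

    import tempfile

    class IdListWriter:
        def temp_id_list_path(self, name, ids):
            # cache the generated path on the instance so repeated calls
            # for the same `name` reuse a single temporary file
            field = f'temp_{name}'
            if hasattr(self, field):
                return getattr(self, field)
            with tempfile.NamedTemporaryFile('w', delete=False) as fout:
                fout.write('\n'.join(map(str, ids)))
            setattr(self, field, fout.name)
            return fout.name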
Example #4
    def get_main_path(self):
        main = self.opt.input.main
        if isinstance(main, str):
            return main

        if hasattr(self, 'temp_main'):
            return self.temp_main

        log.get_logger('MatrixMarketDataReader').debug('creating temporary matrix-market data from numpy-kind array')
        tmp_path = aux.get_temporary_file(self.opt.data.tmp_dir)
        with open(tmp_path, 'wb') as fout:
            if isinstance(main, np.ndarray) and main.ndim == 2:
                main = scipy.sparse.csr_matrix(main)
            if scipy.sparse.issparse(main):
                scipy.io.mmwrite(fout, main)
                self.temp_main = tmp_path
                return tmp_path
        raise RuntimeError(f'Unexpected data type for MatrixMarketOption.input.main field: {type(main)}')
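The conversion step can be isolated as below; a sketch assuming scipy is available, with a hypothetical function name:

    import numpy as np
    import scipy.io
    import scipy.sparse

    def write_matrix_market(path, main):
        # dense 2-d arrays are converted to CSR first so that mmwrite
        # emits a sparse coordinate-format file
        if isinstance(main, np.ndarray) and main.ndim == 2:
            main = scipy.sparse.csr_matrix(main)
        if not scipy.sparse.issparse(main):
            raise RuntimeError(f'Unexpected data type: {type(main)}')
        with open(path, 'wb') as fout:
            scipy.io.mmwrite(fout, main)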
Example #5
 def _create_working_data(self, db, source_path, ignore_lines):
     """
     Args:
         source_path: source data file path
         ignore_lines: number of lines to skip from start line
     """
     vali_indexes = [] if 'vali' not in db else db['vali']['indexes']
     vali_lines = []
     file_path = aux.get_temporary_file(self.opt.data.tmp_dir)
     with open(source_path, mode='r') as fin, \
             open(file_path, 'w') as w:
         file_size = fin.seek(0, 2)  # seek to the end to measure file size
         fin.seek(0, 0)
         for _ in range(ignore_lines):
             fin.readline()
         total = file_size - fin.tell()
         buffered = ''
         CHUNK_SIZE = 4096 * 1000
         total_lines = 0
         vali_indexes = sorted(vali_indexes)
         target_index = vali_indexes[0] if vali_indexes else -1
         vali_indexes = vali_indexes[1:]
         with log.ProgressBar(log.INFO, total=total, mininterval=10) as pbar:
             while True:
                 chunk = fin.read(CHUNK_SIZE)
                 buffered += chunk
                 if buffered == '':
                     break
                 current_file_position = fin.tell()
                 pbar.update(len(chunk))  # advance by the bytes actually read
                 num_lines_on_buffer = buffered.count('\n')
                 # search for the position of each validation sample and
                 # extract it from the training data; only indices strictly
                 # below total_lines + num_lines_on_buffer are complete lines
                 # already in the buffer
                 while target_index >= 0 and target_index < (total_lines + num_lines_on_buffer):
                     no_line = total_lines
                     new_buffered = ''
                     from_index = 0
                     for idx, c in enumerate(buffered):
                         if c == '\n':
                             if no_line == target_index:
                                 vali_lines.append(buffered[from_index:idx])
                                 if from_index > 0:
                                     w.write(buffered[0:from_index])
                                 new_buffered = buffered[idx + 1:]
                                 no_line += 1
                                 total_lines += 1
                                 num_lines_on_buffer -= 1
                                 break
                             no_line += 1
                             total_lines += 1
                             from_index = idx + 1
                             num_lines_on_buffer -= 1
                     buffered = new_buffered
                     if vali_indexes:
                         target_index, vali_indexes = vali_indexes[0], vali_indexes[1:]
                     else:
                         target_index = -1
                 where = buffered.rfind('\n')
                 total_lines += num_lines_on_buffer
                 if where != -1:
                     w.write(buffered[:where + 1])
                     buffered = buffered[where + 1:]
                 elif current_file_position == file_size:
                     w.write(buffered)
                     buffered = ''
     return file_path, vali_lines
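The chunked scan exists to keep memory bounded on large files. For small inputs, a line-by-line version with the same semantics (a hypothetical helper, not part of the class) is much simpler:

    def split_validation_lines(source_path, vali_indexes, ignore_lines=0):
        # returns (train_lines, vali_lines); indexes are counted after
        # the skipped header lines, matching the chunked version above
        vali_indexes = set(vali_indexes)
        train_lines, vali_lines = [], []
        with open(source_path) as fin:
            for _ in range(ignore_lines):
                fin.readline()
            for no_line, line in enumerate(fin):
                line = line.rstrip('\n')
                if no_line in vali_indexes:
                    vali_lines.append(line)
                else:
                    train_lines.append(line)
        return train_lines, vali_lines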
Example #6
    def _create_working_data(self,
                             db,
                             stream_main_path,
                             itemids,
                             with_sppmi=False,
                             windows=5):
        vali_method = None if 'vali' not in db else db['vali'].attrs['method']
        vali_indexes, vali_n = set(), 0
        if vali_method == 'sample':
            vali_indexes = set(db['vali']['indexes'])
        elif vali_method in ['newest']:
            vali_n = db['vali'].attrs['n']
        vali_lines = []

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ResourceWarning)
            if with_sppmi:
                w_sppmi = open(
                    aux.get_temporary_file(root=self.opt.data.tmp_dir), "w")
            file_path = aux.get_temporary_file(root=self.opt.data.tmp_dir)
            with open(stream_main_path) as fin,\
                open(file_path, 'w') as w:
                total_index = 0
                internal_data_type = self.opt.data.internal_data_type
                for line_idx, data in log.iter_pbar(log_level=log.DEBUG,
                                                    iterable=enumerate(fin)):
                    data = data.strip().split()
                    user = line_idx + 1
                    vali_data, train_data = [], []
                    if vali_method in ['newest']:
                        vali_data_size = min(vali_n, len(data) - 1)
                        train_data_size = len(data) - vali_data_size
                        vali = data[train_data_size:]
                        data = data[:train_data_size]
                        for col in Counter(vali):
                            vali_data.append(itemids[col])
                    if internal_data_type in ('stream', 'matrix'):
                        for idx, col in enumerate(data):
                            col = itemids[col]
                            if (idx + total_index) in vali_indexes:
                                vali_data.append(col)
                            else:
                                train_data.append(col)
                    total_index += len(data)
                    if internal_data_type == 'stream':
                        for col in train_data:
                            w.write(f'{user} {col} 1\n')
                        for col in vali_data:
                            # stream-type entries always carry a count of 1
                            vali_lines.append(f'{user} {col} 1')
                    else:
                        for col, val in Counter(train_data).items():
                            w.write(f'{user} {col} {val}\n')
                        for col, val in Counter(vali_data).items():
                            vali_lines.append(f'{user} {col} {val}')
                    if with_sppmi:
                        sz = len(train_data)
                        for i in range(sz):
                            beg, end = i + 1, i + windows + 1
                            for j in range(beg, end):
                                if j >= sz:
                                    break
                                _w, _c = train_data[i], train_data[j]
                                w_sppmi.write(f'{_w} {_c}\n')
                                w_sppmi.write(f'{_c} {_w}\n')
                if with_sppmi:
                    w_sppmi.close()
                    return w.name, vali_lines, w_sppmi.name
                return w.name, vali_lines, None
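The with_sppmi branch at the end emits every co-occurring pair within a fixed window, in both directions, which is the pair file _build_sppmi consumes. The windowing logic in isolation (a hypothetical helper):

    def window_cooccurrence_pairs(items, window=5):
        # emit (w, c) and (c, w) for every pair at distance <= window
        for i, w in enumerate(items):
            for j in range(i + 1, min(i + window + 1, len(items))):
                yield w, items[j]
                yield items[j], w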
Example #7
 def __init__(self, *args, **kwargs):
     self._optimization_info = {'trials': Trials(), 'best': {}}
     self._temporary_opt_file = aux.get_temporary_file()
     self.optimize_after_callback_fn = kwargs.get('optimize_after_callback_fn')
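Trials here presumably comes from hyperopt (from hyperopt import Trials), and the temporary file is a natural place to checkpoint the study between optimization callbacks. A sketch of that persistence step, under those assumptions:

    import pickle
    from hyperopt import Trials

    def save_optimization_state(path, trials, best):
        # serialize the running study so a later callback can resume it
        with open(path, 'wb') as fout:
            pickle.dump({'trials': trials, 'best': best}, fout)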