def _build_sppmi(self, db, working_data_path, sppmi_total_lines, k):
    self.logger.info(f"build sppmi (shift k: {k})")
    sz = db.attrs["num_items"]
    nnz = 0
    self.logger.debug("sort working_data")
    aux.psort(working_data_path, key=1)
    w_path = aux.get_temporary_file(root=self.opt.data.tmp_dir)
    self.logger.debug(f"build sppmi in_parallel. w: {w_path}")
    num_workers = psutil.cpu_count()
    nnz = parallel_build_sppmi(working_data_path, w_path, sppmi_total_lines, sz, k, num_workers)
    self.logger.debug(f"sort output. w: {w_path}, nnz: {nnz}")
    aux.psort(w_path)
    self.logger.info(f"convert from {working_data_path} to {w_path}")
    db.create_group("sppmi")
    db.attrs["sppmi_nnz"] = nnz
    self.logger.info(f"sppmi nnz: {nnz}")
    db["sppmi"].create_dataset("indptr", (sz,), dtype='int64', maxshape=(sz,))
    db["sppmi"].create_dataset("key", (nnz,), dtype='int32', maxshape=(nnz,))
    db["sppmi"].create_dataset("val", (nnz,), dtype='float32', maxshape=(nnz,))
    self.logger.info('Disk-based Compressing...')
    job_files = self._chunking_into_bins(w_path, nnz, sz, 0)
    self._build_compressed_triplets(db["sppmi"], job_files,
                                    num_lines=nnz,
                                    max_key=sz,
                                    is_colwise=0)
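Both SPPMI builders (the parallel variant above and the pure-Python variant further below) fill the `sppmi` group with shifted positive PMI values. As a reference for the per-pair formula that the pure-Python variant applies inline, here is a minimal sketch; the function name and arguments are illustrative only, not part of the library:

import numpy as np

def shifted_positive_pmi(cooc, total_pairs, freq_w, freq_c, k):
    # PMI(w, c) = log(#(w, c)) + log(D) - log(#(w)) - log(#(c)),
    # where D is the total number of co-occurrence pairs.
    pmi = np.log(cooc) + np.log(total_pairs) - np.log(freq_w) - np.log(freq_c)
    # Shift by log(k); only strictly positive values end up in the matrix.
    return max(pmi - np.log(k), 0.0)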
def _build_data(self, db, working_data_path, validation_data,
                target_groups=['rowwise', 'colwise'], sort=True):
    available_mb = psutil.virtual_memory().available / 1024 / 1024
    approximated_data_mb = 0
    with open(working_data_path, 'rb') as fin:
        fin.seek(0, 2)
        approximated_data_mb = db.attrs['num_nnz'] * 3 * 4 / 1024 / 1024
    buffer_mb = int(max(1024, available_mb * 0.75))
    # for each side
    for group, sep_idx, max_key in [('rowwise', 0, db.attrs['num_users']),
                                    ('colwise', 1, db.attrs['num_items'])]:
        if group not in target_groups:
            continue
        self.logger.info(f'Building compressed triplets for {group}...')
        self.logger.info('Preprocessing...')
        self.prepro.pre(db)
        if approximated_data_mb * 1.2 < available_mb:
            self.logger.info('In-memory Compressing ...')
            job_files = self._sort_and_compressed_binarization(
                working_data_path, db.attrs['num_nnz'], max_key,
                sort_key=sep_idx + 1 if sort else -1)
            self._load_compressed_triplet_bin(
                db[group], job_files,
                num_lines=db.attrs['num_nnz'],
                max_key=max_key,
                is_colwise=sep_idx)
        else:
            self.logger.info('Disk-based Compressing...')
            if sort:
                aux.psort(working_data_path, tmp_dir=self.opt.data.tmp_dir,
                          key=sep_idx + 1, buffer_mb=buffer_mb)
            job_files = self._chunking_into_bins(working_data_path,
                                                 db.attrs['num_nnz'],
                                                 max_key,
                                                 sep_idx=sep_idx)
            self._build_compressed_triplets(db[group], job_files,
                                            num_lines=db.attrs['num_nnz'],
                                            max_key=max_key,
                                            is_colwise=sep_idx)
        self.prepro.post(db[group])
        if group == 'rowwise':
            self.fill_validation_data(db, validation_data)
    self.logger.info('Finished')
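The `indptr`/`key`/`val` datasets that the SPPMI builder creates store the matrix in a CSR-like layout, and `_build_data` fills the `rowwise`/`colwise` groups through the same compressed-triplet helpers. Those helpers work chunk by chunk on disk; the sketch below only illustrates the resulting layout with in-memory NumPy, assuming `indptr[i]` holds the end offset of row `i`'s slice, which is consistent with the `sz`-length `indptr` dataset created above:

import numpy as np

def compress_triplets(triplets, max_key):
    # `triplets` are (row, col, val) tuples already sorted by row.
    rows = np.array([r for r, _, _ in triplets], dtype=np.int64)
    key = np.array([c for _, c, _ in triplets], dtype=np.int32)
    val = np.array([v for _, _, v in triplets], dtype=np.float32)
    # indptr[i] = number of entries whose row is <= i, i.e. the end of row i.
    indptr = np.searchsorted(rows, np.arange(max_key), side='right').astype(np.int64)
    return indptr, key, val

For example, compress_triplets([(0, 1, 1.0), (0, 3, 2.0), (2, 0, 1.0)], 3) yields indptr = [2, 2, 3], key = [1, 3, 0], val = [1.0, 2.0, 1.0].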
def _build_sppmi(self, db, working_data_path, k):
    self.logger.info(f"build sppmi (shift k: {k})")
    appearances = {}
    nnz = 0
    aux.psort(working_data_path, key=1)
    with open(working_data_path, "r") as fin, \
            open(aux.get_temporary_file(root=self.opt.data.tmp_dir), "w") as w:
        D = sum(1 for line in fin)  # total number of co-occurrence pairs
        fin.seek(0)
        probe, chunk = "junk", []
        for line in fin:
            _w, _c = line.strip().split()
            if probe != _w:
                # the file is sorted by the first column, so a change of word
                # means all contexts of the previous word have been collected
                appearances[probe] = len(chunk)
                for __c, cnt in Counter(chunk).items():
                    if int(probe) < int(__c):
                        continue
                    pmi = np.log(cnt) + np.log(D) - \
                        np.log(appearances[probe]) - np.log(appearances[__c])
                    sppmi = pmi - np.log(k)
                    if sppmi > 0:
                        # write both directions to keep the matrix symmetric
                        w.write(f"{probe} {__c} {sppmi}\n")
                        w.write(f"{__c} {probe} {sppmi}\n")
                        nnz += 2
                probe, chunk = _w, []
            chunk.append(_c)
    aux.psort(w.name)
    self.logger.info(f"convert from {working_data_path} to {w.name}")
    db.create_group("sppmi")
    db.attrs["sppmi_nnz"] = nnz
    self.logger.info(f"sppmi nnz: {nnz}")
    sz = db.attrs["num_items"]
    db["sppmi"].create_dataset("indptr", (sz,), dtype='int64', maxshape=(sz,))
    db["sppmi"].create_dataset("key", (nnz,), dtype='int32', maxshape=(nnz,))
    db["sppmi"].create_dataset("val", (nnz,), dtype='float32', maxshape=(nnz,))
    self.logger.info('Disk-based Compressing...')
    job_files = self._chunking_into_bins(w.name, nnz, sz, 0)
    self._build_compressed_triplets(db["sppmi"], job_files,
                                    num_lines=nnz,
                                    max_key=sz,
                                    is_colwise=0)
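To see the effect of the shift with made-up numbers: with D = 1000 pairs in total, a pair observed cnt = 30 times between a word appearing 100 times and a context appearing 50 times gives PMI = log(30 * 1000 / (100 * 50)) = log(6) ≈ 1.79. With shift k = 5 the SPPMI value is 1.79 - log(5) ≈ 0.18, so the entry is written; with k = 10 it becomes negative and is dropped. Larger shifts therefore produce a sparser matrix.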
def prepare_dataset():
    logger = log.get_logger()
    if not os.path.isdir('ext/ml-100k/'):
        logger.warn('Cannot find the ./ext/ml-100k directory')
    else:
        if not os.path.isfile('./ext/ml-100k/main'):
            logger.info('preprocessing for matrix market format of ml-100k...')
            in_path = "./ext/ml-100k/u.data"
            stream_out_path = "./ext/ml-100k/stream"
            aux.psort(in_path, field_seperator="\t", key=4)
            aux.psort(in_path, field_seperator="\t", key=1)
            with open('./ext/ml-100k/main', 'w') as fout:
                fout.write('%%MatrixMarket matrix coordinate integer general\n%\n%\n943 1682 80000\n')
                with open(in_path) as fin:
                    for line in fin:
                        u, i, v, ts = line.strip().split('\t')
                        fout.write('%s %s %s\n' % (u, i, v))
            iids = []
            with open('./ext/ml-100k/iid', 'w') as fout:
                with open('./ext/ml-100k/u.item', encoding='ISO-8859-1') as fin:
                    iids = [line.strip().split('|')[1].replace(' ', '_')
                            for line in fin]
                    iids = [f"{idx}.{key}" for idx, key in enumerate(iids)]
                    fout.write("\n".join(iids))
            with open('./ext/ml-100k/uid', 'w') as fout:
                for line in open('./ext/ml-100k/u.user'):
                    userid = line.strip().split('|')[0]
                    fout.write('%s\n' % userid)
            logger.info('preprocessing for stream format of ml-100k...')
            probe, bag = None, []
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                for line in fin:
                    u, i, v, ts = line.strip().split("\t")
                    if not probe:
                        probe = u
                    elif probe != u:
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(iids[int(i) - 1])
                if bag:
                    fout.write(" ".join(bag))
    if not os.path.isdir('ext/ml-20m'):
        logger.warn('Cannot find the ./ml-20m directory')
    else:
        if not os.path.isfile('./ext/ml-20m/main'):
            logger.info('preprocessing for matrix market format of ml-20m...')
            uids, iids = {}, {}
            in_path = "./ext/ml-20m/ratings.csv"
            aux.psort(in_path, field_seperator=",", key=4)
            aux.psort(in_path, field_seperator=",", key=1)
            with open(in_path) as fin:
                fin.readline()
                for line in fin:
                    uid = line.split(',')[0]
                    if uid not in uids:
                        uids[uid] = len(uids) + 1
            with open('./ext/ml-20m/uid', 'w') as fout:
                for uid, _ in sorted(uids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % uid)
            with open('./ext/ml-20m/movies.csv') as fin:
                fin.readline()
                for line in fin:
                    iid = line.split(',')[0]
                    iids[iid] = len(iids) + 1
            with open('./ext/ml-20m/iid', 'w') as fout:
                for iid, _ in sorted(iids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % iid)
            with open('./ext/ml-20m/main', 'w') as fout:
                fout.write('%%MatrixMarket matrix coordinate real general\n%\n%\n138493 27278 20000263\n')
                with open('./ext/ml-20m/ratings.csv') as fin:
                    fin.readline()
                    for line in fin:
                        uid, iid, r, *_ = line.split(',')
                        uid, iid = uids[uid], iids[iid]
                        fout.write(f'{uid} {iid} {r}\n')
            logger.info('preprocessing for stream format of ml-20m...')
            probe, bag = None, []
            stream_out_path = "./ext/ml-20m/stream"
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                fin.readline()
                for line in fin:
                    u, i, v, ts = line.strip().split(",")
                    if not probe:
                        probe = u
                    elif probe != u:
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(i)
                if bag:
                    fout.write(" ".join(bag))
    if not os.path.isdir('ext/text8'):
        logger.warn('Cannot find the text8 directory')
    else:
        if not os.path.isfile('./ext/text8/main'):
            with open('./ext/text8/text8') as fin:
                words = fin.readline().strip().split()
            with open('./ext/text8/main', 'w') as fout:
                for i in range(0, len(words), 1000):
                    fout.write('%s\n' % ' '.join(words[i:i + 1000]))
    if not os.path.isdir('brunch'):
        logger.warn('Cannot find the brunch directory')
    else:
        if not os.path.isfile('./ext/brunch/main'):
            os.makedirs('./ext/brunch/tmp', exist_ok=True)
            to_dir = './ext/brunch/tmp'
            logger.info('dividing...')
            num_chunks = 30
            fouts = {i: open(os.path.join(to_dir, str(i)), 'w')
                     for i in range(num_chunks)}
            for path, fname in iterate_brunch_data_files('./ext/brunch'):
                for line in open(path):
                    uid = line.strip().split()[0]
                    fid = hash(uid) % num_chunks
                    fouts[fid].write(line)
            for val in fouts.values():
                val.close()
            logger.info('merging...')
            with open('./ext/brunch/main', 'w') as fout, \
                    open('./ext/brunch/uid', 'w') as fout_uid:
                for fid in fouts.keys():
                    seens = {}
                    chunk_path = os.path.join(to_dir, str(fid))
                    for line in open(chunk_path):
                        line = line.strip().split()
                        uid, seen = line[0], line[1:]
                        seens.setdefault(uid, []).extend(seen)
                    for uid, seen in seens.items():
                        fout.write(' '.join(seen) + '\n')
                        fout_uid.write(uid + '\n')
            for fid in fouts.keys():
                chunk_path = os.path.join(to_dir, str(fid))
                os.remove(chunk_path)
            make_mm_from_stream('./ext/brunch/', './ext/brunch/mm')
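For orientation, the ml-100k branch emits the same ratings in two forms: `main` is a MatrixMarket file whose header declares the 943 x 1682 matrix with 80000 entries, followed by `user item rating` triplets, while `stream` holds one line per user listing the items that user rated, using the underscore-joined titles written to `iid`. Schematically (only the header is literal; the triplets and titles below are made up):

./ext/ml-100k/main:
%%MatrixMarket matrix coordinate integer general
%
%
943 1682 80000
1 1 5
1 2 3
...

./ext/ml-100k/stream:
0.Some_Movie_(1995) 12.Another_Movie_(1996) ...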