Example #1
 def _build_sppmi(self, db, working_data_path, sppmi_total_lines, k):
     self.logger.info(f"build sppmi (shift k: {k})")
     sz = db.attrs["num_items"]
     nnz = 0
     self.logger.debug("sort working_data")
     aux.psort(working_data_path, key=1)
     w_path = aux.get_temporary_file(root=self.opt.data.tmp_dir)
     self.logger.debug(f"build sppmi in_parallel. w: {w_path}")
     num_workers = psutil.cpu_count()
     nnz = parallel_build_sppmi(working_data_path, w_path,
                                sppmi_total_lines, sz, k, num_workers)
     self.logger.debug(f"sort output. w: {w_path}, nnz: {nnz}")
     aux.psort(w_path)
     self.logger.info(f"convert from {working_data_path} to {w_path}")
     db.create_group("sppmi")
     db.attrs["sppmi_nnz"] = nnz
     self.logger.info(f"sppmi nnz: {nnz}")
     db["sppmi"].create_dataset("indptr", (sz, ),
                                dtype='int64',
                                maxshape=(sz, ))
     db["sppmi"].create_dataset("key", (nnz, ),
                                dtype='int32',
                                maxshape=(nnz, ))
     db["sppmi"].create_dataset("val", (nnz, ),
                                dtype='float32',
                                maxshape=(nnz, ))
     self.logger.info('Disk-based Compressing...')
     job_files = self._chunking_into_bins(w_path, nnz, sz, 0)
     self._build_compressed_triplets(db["sppmi"],
                                     job_files,
                                     num_lines=nnz,
                                     max_key=sz,
                                     is_colwise=0)
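
The indptr, key and val datasets created above form a CSR-like layout for the item-by-item SPPMI matrix. A minimal reader sketch, assuming indptr[i] holds the end offset of row i (so a leading zero must be prepended) and reusing the group and attribute names from the example; the load_sppmi helper itself is hypothetical:

import h5py
import numpy as np
from scipy.sparse import csr_matrix

def load_sppmi(db_path):
    # Rebuild the SPPMI matrix from the "sppmi" group written above.
    with h5py.File(db_path, "r") as db:
        sz = db.attrs["num_items"]
        group = db["sppmi"]
        indptr = np.concatenate(([0], group["indptr"][:]))  # assumed per-row end offsets
        return csr_matrix((group["val"][:], group["key"][:], indptr),
                          shape=(sz, sz))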
Example #2
 def _build_data(self,
                 db,
                 working_data_path,
                 validation_data,
                 target_groups=['rowwise', 'colwise'],
                 sort=True):
     available_mb = psutil.virtual_memory().available / 1024 / 1024
     # rough size estimate: each triplet is three 4-byte fields (row, key, value)
     approximated_data_mb = db.attrs['num_nnz'] * 3 * 4 / 1024 / 1024
     buffer_mb = int(max(1024, available_mb * 0.75))
     # for each side (rowwise and colwise)
     for group, sep_idx, max_key in [('rowwise', 0, db.attrs['num_users']),
                                     ('colwise', 1, db.attrs['num_items'])]:
         if group not in target_groups:
             continue
         self.logger.info(f'Building compressed triplets for {group}...')
         self.logger.info('Preprocessing...')
         self.prepro.pre(db)
         if approximated_data_mb * 1.2 < available_mb:
             self.logger.info('In-memory Compressing ...')
             job_files = self._sort_and_compressed_binarization(
                 working_data_path,
                 db.attrs['num_nnz'],
                 max_key,
                 sort_key=sep_idx + 1 if sort else -1)
             self._load_compressed_triplet_bin(
                 db[group],
                 job_files,
                 num_lines=db.attrs['num_nnz'],
                 max_key=max_key,
                 is_colwise=sep_idx)
         else:
             self.logger.info('Disk-based Compressing...')
             if sort:
                 aux.psort(working_data_path,
                           tmp_dir=self.opt.data.tmp_dir,
                           key=sep_idx + 1,
                           buffer_mb=buffer_mb)
             job_files = self._chunking_into_bins(working_data_path,
                                                  db.attrs['num_nnz'],
                                                  max_key,
                                                  sep_idx=sep_idx)
             self._build_compressed_triplets(db[group],
                                             job_files,
                                             num_lines=db.attrs['num_nnz'],
                                             max_key=max_key,
                                             is_colwise=sep_idx)
         self.prepro.post(db[group])
         if group == 'rowwise':
             self.fill_validation_data(db, validation_data)
         self.logger.info('Finished')
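
The branch between in-memory and disk-based compression above rests on a simple estimate: each triplet takes roughly three 4-byte fields, and the in-memory path is taken only when 1.2 times that estimate fits in available RAM. A self-contained illustration of the same arithmetic (the nonzero count is made up):

import psutil

num_nnz = 100_000_000                                  # hypothetical dataset size
approximated_data_mb = num_nnz * 3 * 4 / 1024 / 1024   # about 1144 MB of triplets
available_mb = psutil.virtual_memory().available / 1024 / 1024
use_in_memory = approximated_data_mb * 1.2 < available_mb
print(f"estimated {approximated_data_mb:.0f} MB, in-memory={use_in_memory}")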
Example #3
 def _build_sppmi(self, db, working_data_path, k):
     self.logger.info(f"build sppmi (shift k: {k})")
     appearances = {}
     nnz = 0
     aux.psort(working_data_path, key=1)
     with open(working_data_path, "r") as fin, \
         open(aux.get_temporary_file(root=self.opt.data.tmp_dir), "w") as w:
         D = sum(1 for line in fin)
         fin.seek(0)
         probe, chunk = "junk", []  # sentinel key; the first real word replaces it below
         for line in fin:
             _w, _c = line.strip().split()
             if probe != _w:
                 appearances[probe] = len(chunk)
                 for __c, cnt in Counter(chunk).items():
                     if int(probe) < int(__c):
                         continue
                     pmi = np.log(cnt) + np.log(D) - \
                         np.log(appearances[probe]) - np.log(appearances[__c])
                     sppmi = pmi - np.log(k)
                     if sppmi > 0:
                         w.write(f"{probe} {__c} {sppmi}\n")
                         w.write(f"{__c} {probe} {sppmi}\n")
                         nnz += 2
                 probe, chunk = _w, []
             chunk.append(_c)
         # Flush the final key: the loop above only emits a word's pairs when
         # the next word begins, so the last word's chunk would otherwise be dropped.
         if chunk:
             appearances[probe] = len(chunk)
             for __c, cnt in Counter(chunk).items():
                 if int(probe) < int(__c):
                     continue
                 pmi = np.log(cnt) + np.log(D) - \
                     np.log(appearances[probe]) - np.log(appearances[__c])
                 sppmi = pmi - np.log(k)
                 if sppmi > 0:
                     w.write(f"{probe} {__c} {sppmi}\n")
                     w.write(f"{__c} {probe} {sppmi}\n")
                     nnz += 2
     aux.psort(w.name)
     self.logger.info(f"convert from {working_data_path} to {w.name}")
     db.create_group("sppmi")
     db.attrs["sppmi_nnz"] = nnz
     self.logger.info(f"sppmi nnz: {nnz}")
     sz = db.attrs["num_items"]
     db["sppmi"].create_dataset("indptr", (sz, ),
                                dtype='int64',
                                maxshape=(sz, ))
     db["sppmi"].create_dataset("key", (nnz, ),
                                dtype='int32',
                                maxshape=(nnz, ))
     db["sppmi"].create_dataset("val", (nnz, ),
                                dtype='float32',
                                maxshape=(nnz, ))
     self.logger.info('Disk-based Compressing...')
     job_files = self._chunking_into_bins(w.name, nnz, sz, 0)
     self._build_compressed_triplets(db["sppmi"],
                                     job_files,
                                     num_lines=nnz,
                                     max_key=sz,
                                     is_colwise=0)
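
For reference, the value written for each pair above is shifted positive pointwise mutual information: PMI(w, c) = log(#(w, c) * D / (#(w) * #(c))) and SPPMI(w, c) = max(PMI(w, c) - log(k), 0), where D is the total number of (word, context) pairs. A standalone sketch of that formula (the function name and arguments are illustrative, not part of the library):

import numpy as np

def sppmi_value(cooccurrence, word_count, context_count, total_pairs, k):
    # PMI of the pair, computed in log space exactly as in the loop above.
    pmi = (np.log(cooccurrence) + np.log(total_pairs)
           - np.log(word_count) - np.log(context_count))
    # Shift by log(k) and clip at zero; only positive values are kept.
    return max(pmi - np.log(k), 0.0)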
Example #4
def prepare_dataset():
    logger = log.get_logger()
    if not os.path.isdir('ext/ml-100k/'):
        logger.warn('Cannot find the ./ext/ml-100k directory')
    else:
        if not os.path.isfile('./ext/ml-100k/main'):
            logger.info('preprocessing for matrix market format of ml-100k...')
            in_path = "./ext/ml-100k/u.data"
            stream_out_path = "./ext/ml-100k/stream"
            aux.psort(in_path, field_seperator="\t", key=4)
            aux.psort(in_path, field_seperator="\t", key=1)

            with open('./ext/ml-100k/main', 'w') as fout:
                fout.write(
                    '%%MatrixMarket matrix coordinate integer general\n%\n%\n943 1682 80000\n'
                )
                with open(in_path) as fin:
                    for line in fin:
                        u, i, v, ts = line.strip().split('\t')
                        fout.write('%s %s %s\n' % (u, i, v))

            iids = []
            with open('./ext/ml-100k/iid', 'w') as fout:
                with open('./ext/ml-100k/u.item',
                          encoding='ISO-8859-1') as fin:
                    iids = [
                        line.strip().split('|')[1].replace(' ', '_')
                        for line in fin
                    ]
                iids = [f"{idx}.{key}" for idx, key in enumerate(iids)]
                fout.write("\n".join(iids))

            with open('./ext/ml-100k/uid', 'w') as fout:
                for line in open('./ext/ml-100k/u.user'):
                    userid = line.strip().split('|')[0]
                    fout.write('%s\n' % userid)

            logger.info('preprocessing for stream format of ml-100k...')
            probe, bag = None, []
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                for line in fin:
                    u, i, v, ts = line.strip().split("\t")
                    if not probe:
                        probe = u
                    elif probe != u:
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(iids[int(i) - 1])
                if bag:
                    fout.write(" ".join(bag))

    if not os.path.isdir('ext/ml-20m'):
        logger.warn('Cannot find the ./ext/ml-20m directory')
    else:
        if not os.path.isfile('./ext/ml-20m/main'):
            logger.info('preprocessing for matrix market format of ml-20m...')
            uids, iids = {}, {}
            in_path = "./ext/ml-20m/ratings.csv"
            aux.psort(in_path, field_seperator=",", key=4)
            aux.psort(in_path, field_seperator=",", key=1)
            with open(in_path) as fin:
                fin.readline()
                for line in fin:
                    uid = line.split(',')[0]
                    if uid not in uids:
                        uids[uid] = len(uids) + 1
            with open('./ext/ml-20m/uid', 'w') as fout:
                for uid, _ in sorted(uids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % uid)
            with open('./ext/ml-20m/movies.csv') as fin:
                fin.readline()
                for line in fin:
                    iid = line.split(',')[0]
                    iids[iid] = len(iids) + 1
            with open('./ext/ml-20m/iid', 'w') as fout:
                for iid, _ in sorted(iids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % iid)
            with open('./ext/ml-20m/main', 'w') as fout:
                fout.write(
                    '%%MatrixMarket matrix coordinate real general\n%\n%\n138493 27278 20000263\n'
                )
                with open('./ext/ml-20m/ratings.csv') as fin:
                    fin.readline()
                    for line in fin:
                        uid, iid, r, *_ = line.split(',')
                        uid, iid = uids[uid], iids[iid]
                        fout.write(f'{uid} {iid} {r}\n')
            logger.info('preprocessing for stream format of ml-20m...')
            probe, bag = None, []
            stream_out_path = "./ext/ml-20m/stream"
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                fin.readline()
                for line in fin:
                    u, i, v, ts = line.strip().split(",")
                    if not probe:
                        probe = u
                    elif probe != u:
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(i)
                if bag:
                    fout.write(" ".join(bag))
    if not os.path.isdir('ext/text8'):
        logger.warn('Cannot find the ./ext/text8 directory')
    else:
        if not os.path.isfile('./ext/text8/main'):
            with open('./ext/text8/text8') as fin:
                words = fin.readline().strip().split()
                with open('./ext/text8/main', 'w') as fout:
                    for i in range(0, len(words), 1000):
                        fout.write('%s\n' % ' '.join(words[i:i + 1000]))

    if not os.path.isdir('ext/brunch'):
        logger.warn('Cannot find the ./ext/brunch directory')
    else:
        if not os.path.isfile('./ext/brunch/main'):
            os.makedirs('./ext/brunch/tmp', exist_ok=True)
            to_dir = './ext/brunch/tmp'

            logger.info('dividing...')
            num_chunks = 30
            fouts = {
                i: open(os.path.join(to_dir, str(i)), 'w')
                for i in range(num_chunks)
            }
            for path, fname in iterate_brunch_data_files('./ext/brunch'):
                for line in open(path):
                    uid = line.strip().split()[0]
                    fid = hash(uid) % num_chunks
                    fouts[fid].write(line)
            for val in fouts.values():
                val.close()

            logger.info('merging...')
            with open('./ext/brunch/main', 'w') as fout, \
                    open('./ext/brunch/uid', 'w') as fout_uid:
                for fid in fouts.keys():
                    seens = {}
                    chunk_path = os.path.join(to_dir, str(fid))
                    for line in open(chunk_path):
                        line = line.strip().split()
                        uid, seen = line[0], line[1:]
                        seens.setdefault(uid, []).extend(seen)
                    for uid, seen in seens.items():
                        fout.write(' '.join(seen) + '\n')
                        fout_uid.write(uid + '\n')
                for fid in fouts.keys():
                    chunk_path = os.path.join(to_dir, str(fid))
                    os.remove(chunk_path)
    make_mm_from_stream('./ext/brunch/', './ext/brunch/mm')
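
The hard-coded headers written above follow the MatrixMarket coordinate format: a banner line, comment lines starting with '%', then a size line of "rows columns nonzeros", followed by one "row col value" triplet per line. A quick sanity check that a generated main file matches its declared nonzero count (check_mm_header is a hypothetical helper, not part of the example):

def check_mm_header(path):
    # Compare the nonzero count declared in the size line with the body length.
    with open(path) as fin:
        assert fin.readline().startswith('%%MatrixMarket')
        line = fin.readline()
        while line.startswith('%'):
            line = fin.readline()
        rows, cols, nnz = map(int, line.split())
        return sum(1 for _ in fin) == nnz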