コード例 #1
0
ファイル: aux.py プロジェクト: zzong2006/buffalo
def psort(path,
          parallel=-1,
          field_seperator=' ',
          key=1,
          tmp_dir='/tmp/',
          buffer_mb=1024,
          output=None):
    """Numerically, stably sort a text file with the system ``sort`` command.

    path: input file. output: destination file (defaults to sorting in-place).
    parallel: worker count; -1 autodetects via psutil, 0 lets sort decide.
    field_seperator: column delimiter. key: 1-based sort column.
    tmp_dir: scratch directory for sort. buffer_mb: memory buffer in MB.
    Raises on any subprocess failure after logging the full command line.
    """
    # TODO: We need better way for OS/platform compatibility.
    # we need compatibility checking routine for this method.
    cmd = ['sort', '-n', '-s']
    if parallel == -1:
        parallel = psutil.cpu_count()
    if parallel > 0:
        cmd += ['--parallel', parallel]
    output = output or path
    cmd += ['-t', '{}'.format(field_seperator)]
    cmd += ['-k', key]
    cmd += ['-T', tmp_dir]
    cmd += ['-S', '%sM' % buffer_mb]
    cmd += ['-o', output]
    cmd.append(path)
    try:
        # LC_ALL=C forces byte-wise collation for reproducible ordering.
        subprocess.check_output([str(c) for c in cmd],
                                stderr=subprocess.STDOUT,
                                env={'LC_ALL': 'C'})
    except Exception as e:
        log.get_logger().error('Unexpected error: %s for %s' %
                               (str(e), ' '.join(list(map(str, cmd)))))
        raise
コード例 #2
0
    def __init__(self, opt_path=None, *args, **kwargs):
        """Initialize the BPRMF model.

        opt_path: path to an option file, or None to use defaults.
        Data source precedence: kwargs['data_opt'], then the option's
        'data_opt', then a ready-made Data object in kwargs['data'].
        """
        # Cooperative init of every mixin base, in declaration order.
        for base in (Algo, BPRMFOption, Evaluable, Serializable, Optimizable):
            base.__init__(self, *args, **kwargs)
        if opt_path is None:
            opt_path = BPRMFOption().get_default_option()

        self.logger = log.get_logger('BPRMF')
        self.opt, self.opt_path = self.get_option(opt_path)
        self.obj = CyBPRMF()
        # The cython backend re-parses the option file on its side.
        parsed = self.obj.init(bytes(self.opt_path, 'utf-8'))
        assert parsed, 'cannot parse option file: %s' % opt_path

        self.data = None
        data_opt = kwargs.get('data_opt', self.opt.get('data_opt'))
        if data_opt:
            self.data = buffalo.data.load(data_opt)
            self.data.create()
        else:
            data = kwargs.get('data')
            if isinstance(data, Data):
                self.data = data
        self.logger.info('BPRMF(%s)' % json.dumps(self.opt, indent=2))
        if self.data:
            self.logger.info(self.data.show_info())
            assert self.data.data_type in ['matrix']
コード例 #3
0
    def __init__(self, opt_path=None, *args, **kwargs):
        """Initialize the ALS model.

        Uses the CUDA backend when opt.accelerator is set; raises
        RuntimeError when acceleration is requested but the cuda
        extension failed to import.
        """
        # Cooperative init of every mixin base, in declaration order.
        for base in (Algo, ALSOption, Evaluable, Serializable, Optimizable):
            base.__init__(self, *args, **kwargs)
        if opt_path is None:
            opt_path = ALSOption().get_default_option()

        self.logger = log.get_logger('ALS')
        self.opt, self.opt_path = self.get_option(opt_path)
        if self.opt.accelerator and not inited_CUALS:
            self.logger.error("ImportError CuALS, no cuda library exists.")
            raise RuntimeError()
        self.obj = CuALS() if self.opt.accelerator else CyALS()
        parsed = self.obj.init(bytes(self.opt_path, 'utf-8'))
        assert parsed, 'cannot parse option file: %s' % opt_path

        self.data = None
        data_opt = kwargs.get('data_opt', self.opt.get('data_opt'))
        if data_opt:
            self.data = buffalo.data.load(data_opt)
            self.data.create()
        else:
            data = kwargs.get('data')
            if isinstance(data, Data):
                self.data = data
        self.logger.info('ALS(%s)' % json.dumps(self.opt, indent=2))
        if self.data:
            self.logger.info(self.data.show_info())
            assert self.data.data_type in ['matrix']
コード例 #4
0
ファイル: warp.py プロジェクト: zzong2006/buffalo
    def __init__(self, opt_path=None, *args, **kwargs):
        """Initialize the WARP model (CPU only; GPU not yet implemented)."""
        # Cooperative init of every mixin base, in declaration order.
        for base in (Algo, WARPOption, Evaluable, Serializable, Optimizable):
            base.__init__(self, *args, **kwargs)
        if opt_path is None:
            opt_path = WARPOption().get_default_option()

        self.logger = log.get_logger('WARP')
        self.opt, self.opt_path = self.get_option(opt_path)
        # TODO:GPU Implementation
        if self.opt.accelerator is True:
            raise NotImplementedError(
                "GPU version WARP is not implemented yet")
        self.obj = CyWARP()

        parsed = self.obj.init(bytes(self.opt_path, 'utf-8'))
        assert parsed, 'cannot parse option file: %s' % opt_path

        self.data = None
        data_opt = kwargs.get('data_opt', self.opt.get('data_opt'))
        if data_opt:
            self.data = buffalo.data.load(data_opt)
            self.data.create()
        else:
            data = kwargs.get('data')
            if isinstance(data, Data):
                self.data = data
        self.logger.info('WARP(%s)' % json.dumps(self.opt, indent=2))
        if self.data:
            self.logger.info(self.data.show_info())
            assert self.data.data_type in ['matrix']
コード例 #5
0
ファイル: w2v.py プロジェクト: ripingit/buffalo
    def __init__(self, opt_path=None, *args, **kwargs):
        """Initialize the W2V (word2vec) model; requires stream-type data."""
        # Cooperative init of every mixin base, in declaration order.
        for base in (Algo, W2VOption, Evaluable, Serializable, Optimizable):
            base.__init__(self, *args, **kwargs)
        if opt_path is None:
            opt_path = W2VOption().get_default_option()

        self.logger = log.get_logger('W2V')
        self.opt, self.opt_path = self.get_option(opt_path)
        self.obj = CyW2V()
        parsed = self.obj.init(bytes(self.opt_path, 'utf-8'))
        assert parsed, 'cannot parse option file: %s' % opt_path

        self.data = None
        data_opt = kwargs.get('data_opt', self.opt.get('data_opt'))
        if data_opt:
            self.data = buffalo.data.load(data_opt)
            assert self.data.data_type == 'stream'
            self.data.create()
        else:
            data = kwargs.get('data')
            if isinstance(data, Data):
                self.data = data
        self.logger.info('W2V(%s)' % json.dumps(self.opt, indent=2))
        if self.data:
            self.logger.info(self.data.show_info())
            assert self.data.data_type in ['stream']
        # Vocabulary bookkeeping, populated later by preprocessing/training.
        self._vocab = aux.Option({'size': 0,
                                  'index': None,
                                  'inv_index': None,
                                  'scale': None,
                                  'dist': None,
                                  'total_word_count': 0})
コード例 #6
0
ファイル: mm.py プロジェクト: ripingit/buffalo
 def __init__(self, opt, *args, **kwargs):
     """Matrix-market data handler; rejects preprocessors it cannot serve."""
     super().__init__(opt, *args, **kwargs)
     self.name = 'MatrixMarket'
     self.logger = log.get_logger('MatrixMarket')
     # SPPMI preprocessing cannot be represented in matrix-market data.
     if isinstance(self.value_prepro, prepro.SPPMI):
         raise RuntimeError(f'{self.opt.data.value_prepro.name} does not support MatrixMarket')
     self.data_type = 'matrix'
コード例 #7
0
ファイル: mm.py プロジェクト: zzong2006/buffalo
    def get_main_path(self):
        """Return a filesystem path for the main input data.

        A string input is returned as-is. Array-like inputs are written once
        to a temporary matrix-market file (cached on ``self.temp_main``).
        Raises RuntimeError for unsupported input types.
        """
        main = self.opt.input.main
        if isinstance(main, str):
            return main

        # Already materialized on a previous call.
        if hasattr(self, 'temp_main'):
            return self.temp_main

        log.get_logger('MatrixMarketDataReader').debug('creating temporary matrix-market data from numpy-kind array')
        temp_path = aux.get_temporary_file(self.opt.data.tmp_dir)
        with open(temp_path, 'wb') as fh:
            # Dense 2-d arrays are converted to sparse before serialization.
            if isinstance(main, np.ndarray) and main.ndim == 2:
                main = scipy.sparse.csr_matrix(main)
            if scipy.sparse.issparse(main):
                scipy.io.mmwrite(fh, main)
                self.temp_main = temp_path
                return temp_path
        raise RuntimeError(f'Unexpected data type for MatrixMarketOption.input.main field: {type(main)}')
コード例 #8
0
ファイル: base.py プロジェクト: zzong2006/buffalo
 def __init__(self, *args, **kwargs):
     """Build a mock ALS-flavoured algorithm for optimizer/tensorboard tests."""
     for base in (Algo, Optimizable, TensorboardExtention):
         base.__init__(self, *args, **kwargs)
     self.logger = log.get_logger('MockAlgo')
     opt = ALSOption().get_default_option()
     tuning = ALSOption().get_default_optimize_option()
     tuning.start_with_default_parameters = False
     opt.optimize = tuning
     opt.model_path = 'hello.world.bin'
     self.opt = opt
     # Sentinel loss: large enough that any real result improves on it.
     self._optimize_loss = {'loss': 987654321.0}
コード例 #9
0
    def __init__(self, opt_path=None, *args, **kwargs):
        """Initialize the CFR model from an option file or default options.

        Data source precedence: kwargs['data_opt'], then the option's
        'data_opt', then a ready-made Data object in kwargs['data'].
        The data option, when given, must declare a matrix internal type.
        """
        # Cooperative init of every mixin base, in declaration order.
        for base in (Algo, CFROption, Evaluable, Serializable, Optimizable):
            base.__init__(self, *args, **kwargs)
        if opt_path is None:
            opt_path = CFROption().get_default_option()

        self.logger = log.get_logger('CFR')

        # put options into cython class with type assertion
        # see comments on options.py for the description of each parameter
        self.opt, self.opt_path = self.get_option(opt_path)
        self.obj = CyCFR()
        # check the validity of option
        self.is_valid_option(self.opt)
        encoded_path = self.opt_path.encode("utf8")
        assert self.obj.init(encoded_path), \
            "putting parameter to cython object failed"

        # ensure embedding matrix is initialzed for preventing segmentation fault
        self.is_initialized = False

        self.data = None
        data_opt = kwargs.get('data_opt', self.opt.get('data_opt'))
        if data_opt:
            assert data_opt.data.internal_data_type == "matrix", \
                f"internal data type is {data_opt.data.internal_data_type}, not matrix"
            self.data = buffalo.data.load(data_opt)
            assert self.data.data_type == 'stream'
            self.data.create()
        else:
            data = kwargs.get('data')
            if isinstance(data, Data):
                self.data = data
        self.logger.info('CFR ({})'.format(json.dumps(self.opt, indent=2)))
        if self.data:
            self.logger.info(self.data.show_info())
            assert self.data.data_type in ['stream']
コード例 #10
0
ファイル: cli.py プロジェクト: zzong2006/buffalo
 def __init__(self):
     """Create the CLI handler with its own 'ALS' logging channel."""
     self.logger = log.get_logger('ALS')
コード例 #11
0
 def __init__(self):
     """Set up the buffered-data object with its own logging channel."""
     self.logger = log.get_logger('BufferedData')
コード例 #12
0
ファイル: als.py プロジェクト: younghai/buffalo
import buffalo.data
from buffalo.misc import aux, log
from buffalo.data.base import Data
from buffalo.algo._als import CyALS
from buffalo.evaluate import Evaluable
from buffalo.algo.options import ALSOption
from buffalo.algo.optimize import Optimizable
from buffalo.algo.tensorflow._als import TFALS
from buffalo.data.buffered_data import BufferedDataMatrix
from buffalo.algo.base import Algo, Serializable, TensorboardExtention

# Optional CUDA backend: fall back to a no-op stub when the cuda extension
# cannot be imported (e.g. no GPU build available).
try:
    from buffalo.algo.cuda._als import CyALS as CuALS
except Exception as e:
    log.get_logger("system").error(
        f"ImportError CuALS, no cuda library exists. error message: {e}")
    # NOTE(review): the stub requires one positional argument, but the real
    # CuALS is constructed as CuALS() — presumably unreachable because callers
    # gate on an 'inited' flag before instantiating; confirm before relying
    # on this fallback being callable.
    CuALS = lambda x: ()


class ALS(Algo, ALSOption, Evaluable, Serializable, Optimizable,
          TensorboardExtention):
    """Python implementation for C-ALS.

    Implementation of Collaborative Filtering for Implicit Feedback datasets.

    Reference: http://yifanhu.net/PUB/cf.pdf"""
    def __init__(self, opt_path=None, *args, **kwargs):
        Algo.__init__(self, *args, **kwargs)
        ALSOption.__init__(self, *args, **kwargs)
        Evaluable.__init__(self, *args, **kwargs)
        Serializable.__init__(self, *args, **kwargs)
コード例 #13
0
ファイル: stream.py プロジェクト: soonhobahng/buffalo
 def __init__(self, opt, *args, **kwargs):
     """Stream-format data handler."""
     super(Stream, self).__init__(opt, *args, **kwargs)
     self.name = 'Stream'
     self.data_type = 'stream'
     self.logger = log.get_logger('Stream')
コード例 #14
0
ファイル: preprocess.py プロジェクト: younghai/buffalo
def prepare_dataset():
    """Build the example datasets' on-disk formats if they are missing.

    For each dataset directory under ./ext (ml-100k, ml-20m, text8, brunch)
    this creates, when absent: a matrix-market `main` file, `uid`/`iid`
    vocabulary files, and a per-user `stream` file. A missing dataset
    directory only produces a warning, not an error.
    """
    logger = log.get_logger()
    if not os.path.isdir('ext/ml-100k/'):
        # Logger.warn is a deprecated alias; use warning().
        logger.warning('Cannot find the ./ext/ml-100k directory')
    else:
        if not os.path.isfile('./ext/ml-100k/main'):
            logger.info('preprocessing for matrix market format of ml-100k...')
            in_path = "./ext/ml-100k/u.data"
            stream_out_path = "./ext/ml-100k/stream"
            # Sort by timestamp, then stable-sort by user id, so each user's
            # records end up contiguous and in chronological order.
            aux.psort(in_path, field_seperator="\t", key=4)
            aux.psort(in_path, field_seperator="\t", key=1)

            with open('./ext/ml-100k/main', 'w') as fout:
                # Fixed header: 943 users x 1682 items, 80000 ratings.
                fout.write(
                    '%%MatrixMarket matrix coordinate integer general\n%\n%\n943 1682 80000\n'
                )
                with open(in_path) as fin:
                    for line in fin:
                        u, i, v, ts = line.strip().split('\t')
                        fout.write('%s %s %s\n' % (u, i, v))

            iids = []
            with open('./ext/ml-100k/iid', 'w') as fout:
                with open('./ext/ml-100k/u.item',
                          encoding='ISO-8859-1') as fin:
                    # Column 1 of u.item is the title; spaces would break the
                    # whitespace-separated stream format, so replace them.
                    iids = [
                        line.strip().split('|')[1].replace(' ', '_')
                        for line in fin
                    ]
                iids = [f"{idx}.{key}" for idx, key in enumerate(iids)]
                fout.write("\n".join(iids))

            with open('./ext/ml-100k/uid', 'w') as fout:
                for line in open('./ext/ml-100k/u.user'):
                    userid = line.strip().split('|')[0]
                    fout.write('%s\n' % userid)

            logger.info('preprocessing for stream format of ml-100k...')
            probe, bag = None, []
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                for line in fin:
                    u, i, v, ts = line.strip().split("\t")
                    if not probe:
                        probe = u
                    elif probe != u:
                        # User changed: flush the previous user's item list.
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(iids[int(i) - 1])
                if bag:
                    fout.write(" ".join(bag))

    if not os.path.isdir('ext/ml-20m'):
        logger.warning('Cannot find the ./ml-20m directory')
    else:
        if not os.path.isfile('./ext/ml-20m/main'):
            logger.info('preprocessing for matrix market format of ml-20m...')
            uids, iids = {}, {}
            in_path = "./ext/ml-20m/ratings.csv"
            aux.psort(in_path, field_seperator=",", key=4)
            aux.psort(in_path, field_seperator=",", key=1)
            # First pass assigns dense 1-based ids to users in file order.
            with open(in_path) as fin:
                fin.readline()
                for line in fin:
                    uid = line.split(',')[0]
                    if uid not in uids:
                        uids[uid] = len(uids) + 1
            with open('./ext/ml-20m/uid', 'w') as fout:
                for uid, _ in sorted(uids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % uid)
            with open('./ext/ml-20m/movies.csv') as fin:
                fin.readline()
                for line in fin:
                    iid = line.split(',')[0]
                    iids[iid] = len(iids) + 1
            with open('./ext/ml-20m/iid', 'w') as fout:
                for iid, _ in sorted(iids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % iid)
            with open('./ext/ml-20m/main', 'w') as fout:
                # Fixed header: 138493 users x 27278 items, 20000263 ratings.
                fout.write(
                    '%%MatrixMarket matrix coordinate real general\n%\n%\n138493 27278 20000263\n'
                )
                with open('./ext/ml-20m/ratings.csv') as fin:
                    fin.readline()
                    for line in fin:
                        uid, iid, r, *_ = line.split(',')
                        uid, iid = uids[uid], iids[iid]
                        fout.write(f'{uid} {iid} {r}\n')
            logger.info('preprocessing for stream format of ml-20m...')
            probe, bag = None, []
            stream_out_path = "./ext/ml-20m/stream"
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                fin.readline()
                for line in fin:
                    u, i, v, ts = line.strip().split(",")
                    if not probe:
                        probe = u
                    elif probe != u:
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(i)
                if bag:
                    fout.write(" ".join(bag))
    if not os.path.isdir('ext/text8'):
        logger.warning('Cannot find the text8 directory')
    else:
        if not os.path.isfile('./ext/text8/main'):
            # text8 is a single huge line; rewrap as 1000-word lines.
            with open('./ext/text8/text8') as fin:
                words = fin.readline().strip().split()
                with open('./ext/text8/main', 'w') as fout:
                    for i in range(0, len(words), 1000):
                        fout.write('%s\n' % ' '.join(words[i:i + 1000]))

    # BUG FIX: the directory lives under ./ext like the other datasets;
    # checking 'brunch' meant this branch could never run.
    if not os.path.isdir('ext/brunch'):
        logger.warning('Cannot find the brunch directory')
    else:
        if not os.path.isfile('./ext/brunch/main'):
            os.makedirs('./ext/brunch/tmp', exist_ok=True)
            to_dir = './ext/brunch/tmp'

            logger.info('dividing...')
            # Shard lines into chunks by hashed user id so each user's
            # records land in exactly one chunk file.
            num_chunks = 30
            fouts = {
                i: open(os.path.join(to_dir, str(i)), 'w')
                for i in range(num_chunks)
            }
            for path, _fname in iterate_brunch_data_files('./ext/brunch'):
                for line in open(path):
                    uid = line.strip().split()[0]
                    fid = hash(uid) % num_chunks
                    fouts[fid].write(line)
            for val in fouts.values():
                val.close()

            logger.info('merging...')
            with open('./ext/brunch/main', 'w') as fout, \
                    open('./ext/brunch/uid', 'w') as fout_uid:
                for fid in fouts.keys():
                    # Per-chunk merge: concatenate each user's seen lists.
                    seens = {}
                    chunk_path = os.path.join(to_dir, str(fid))
                    for line in open(chunk_path):
                        line = line.strip().split()
                        uid, seen = line[0], line[1:]
                        seens.setdefault(uid, []).extend(seen)
                    for uid, seen in seens.items():
                        fout.write(' '.join(seen) + '\n')
                        fout_uid.write(uid + '\n')
                for fid in fouts.keys():
                    chunk_path = os.path.join(to_dir, str(fid))
                    os.remove(chunk_path)
    # NOTE(review): runs unconditionally, even when ./ext/brunch is absent —
    # presumably relies on the files already existing; confirm before moving.
    make_mm_from_stream('./ext/brunch/', './ext/brunch/mm')
コード例 #15
0
ファイル: preprocess.py プロジェクト: zzong2006/buffalo
def prepare_dataset():
    """Build the example datasets' on-disk formats if they are missing.

    For each dataset directory under ./ext (ml-100k, ml-20m, text8) this
    creates, when absent: a matrix-market `main` file, `uid`/`iid`
    vocabulary files, and a per-user `stream` file. A missing dataset
    directory only produces a warning, not an error.
    """
    logger = log.get_logger()
    if not os.path.isdir('ext/ml-100k/'):
        # Logger.warn is a deprecated alias; use warning().
        logger.warning('Cannot find the ./ext/ml-100k directory')
    else:
        if not os.path.isfile('./ext/ml-100k/main'):
            logger.info('preprocessing for matrix market format of ml-100k...')
            in_path = "./ext/ml-100k/u.data"
            stream_out_path = "./ext/ml-100k/stream"
            # Sort by timestamp, then stable-sort by user id, so each user's
            # records end up contiguous and in chronological order.
            aux.psort(in_path, field_seperator="\t", key=4)
            aux.psort(in_path, field_seperator="\t", key=1)

            with open('./ext/ml-100k/main', 'w') as fout:
                # Fixed header: 943 users x 1682 items, 80000 ratings.
                fout.write(
                    '%%MatrixMarket matrix coordinate integer general\n%\n%\n943 1682 80000\n'
                )
                with open(in_path) as fin:
                    for line in fin:
                        u, i, v, ts = line.strip().split('\t')
                        fout.write('%s %s %s\n' % (u, i, v))

            iids = []
            with open('./ext/ml-100k/iid', 'w') as fout:
                with open('./ext/ml-100k/u.item',
                          encoding='ISO-8859-1') as fin:
                    # Column 1 of u.item is the title; spaces would break the
                    # whitespace-separated stream format, so replace them.
                    iids = [
                        line.strip().split('|')[1].replace(' ', '_')
                        for line in fin
                    ]
                iids = [f"{idx}.{key}" for idx, key in enumerate(iids)]
                fout.write("\n".join(iids))

            with open('./ext/ml-100k/uid', 'w') as fout:
                for line in open('./ext/ml-100k/u.user'):
                    userid = line.strip().split('|')[0]
                    fout.write('%s\n' % userid)

            logger.info('preprocessing for stream format of ml-100k...')
            probe, bag = None, []
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                for line in fin:
                    u, i, v, ts = line.strip().split("\t")
                    if not probe:
                        probe = u
                    elif probe != u:
                        # User changed: flush the previous user's item list.
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(iids[int(i) - 1])
                if bag:
                    fout.write(" ".join(bag))

    if not os.path.isdir('ext/ml-20m'):
        logger.warning('Cannot find the ./ml-20m directory')
    else:
        if not os.path.isfile('./ext/ml-20m/main'):
            logger.info('preprocessing for matrix market format of ml-20m...')
            uids, iids = {}, {}
            in_path = "./ext/ml-20m/ratings.csv"
            aux.psort(in_path, field_seperator=",", key=4)
            aux.psort(in_path, field_seperator=",", key=1)
            # First pass assigns dense 1-based ids to users in file order.
            with open(in_path) as fin:
                fin.readline()
                for line in fin:
                    uid = line.split(',')[0]
                    if uid not in uids:
                        uids[uid] = len(uids) + 1
            with open('./ext/ml-20m/uid', 'w') as fout:
                for uid, _ in sorted(uids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % uid)
            with open('./ext/ml-20m/movies.csv') as fin:
                fin.readline()
                for line in fin:
                    iid = line.split(',')[0]
                    iids[iid] = len(iids) + 1
            with open('./ext/ml-20m/iid', 'w') as fout:
                for iid, _ in sorted(iids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % iid)
            with open('./ext/ml-20m/main', 'w') as fout:
                # Fixed header: 138493 users x 27278 items, 20000263 ratings.
                fout.write(
                    '%%MatrixMarket matrix coordinate real general\n%\n%\n138493 27278 20000263\n'
                )
                with open('./ext/ml-20m/ratings.csv') as fin:
                    fin.readline()
                    for line in fin:
                        uid, iid, r, *_ = line.split(',')
                        uid, iid = uids[uid], iids[iid]
                        fout.write(f'{uid} {iid} {r}\n')
            logger.info('preprocessing for stream format of ml-20m...')
            probe, bag = None, []
            stream_out_path = "./ext/ml-20m/stream"
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                fin.readline()
                for line in fin:
                    u, i, v, ts = line.strip().split(",")
                    if not probe:
                        probe = u
                    elif probe != u:
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(i)
                if bag:
                    fout.write(" ".join(bag))
    if not os.path.isdir('ext/text8'):
        logger.warning('Cannot find the text8 directory')
    else:
        if not os.path.isfile('./ext/text8/main'):
            # text8 is a single huge line; rewrap as 1000-word lines.
            with open('./ext/text8/text8') as fin:
                words = fin.readline().strip().split()
                with open('./ext/text8/main', 'w') as fout:
                    for i in range(0, len(words), 1000):
                        fout.write('%s\n' % ' '.join(words[i:i + 1000]))
コード例 #16
0
 def __init__(self, opt, name="tf_als"):
     """TensorFlow(v1) ALS wrapper: keeps its own session and graph handles."""
     self.opt = opt
     self.name = name
     self.logger = log.get_logger("tf-als")
     self.sess = tf.Session()
     self.graph = tf.get_default_graph()