Example #1
def main():

    import os
    import logging
    import glob
    from optparse import OptionParser
    from collections import defaultdict

    from mrec import load_sparse_matrix
    from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate
    from mrec.evaluation import Evaluator
    from mrec.evaluation.metrics import print_report
    from filename_conventions import get_testfile, get_recsfile

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--test_input_format',dest='test_input_format',default='npz',help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary)  (default: %default)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--recsdir',dest='recsdir',help='directory containing tsv files of precomputed recommendations')
    parser.add_option('--metrics',dest='metrics',default='main',help='which set of metrics to compute, main|hitrate (default: %default)')
    parser.add_option('--description',dest='description',help='description of model which generated the recommendations')
    metrics_funcs = {'main':compute_main_metrics,
                     'hitrate':compute_hit_rate}

    (opts,args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.recsdir \
            or opts.metrics not in metrics_funcs:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.recsdir = os.path.abspath(os.path.expanduser(opts.recsdir))

    evaluator = Evaluator(metrics_funcs[opts.metrics],max_items=20)

    trainfiles = glob.glob(opts.train)

    all_metrics = defaultdict(list)
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        testfile = get_testfile(trainfile)
        recsfile = get_recsfile(trainfile,opts.recsdir)
        testdata = load_sparse_matrix(opts.test_input_format,testfile).tocsr()
        cum_metrics,count = evaluator.process(testdata,recsfile,0,testdata.shape[0])
        if cum_metrics is not None:
            for m in cum_metrics:
                all_metrics[m].append(float(cum_metrics[m])/count)

    print_report([opts.description],[all_metrics])
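
This example relies on a local filename_conventions module to map each training file to its matching test set and recommendations file; that module is not shown here. Below is a minimal, hypothetical sketch of what such helpers might look like, assuming a naming scheme where 'train' in the filename is swapped for 'test' and recommendations are stored per training file inside the recommendations directory. The exact conventions used by the real module may differ.

import os

def get_testfile(trainfile):
    # hypothetical: assume the test split sits next to the training file,
    # with 'train' swapped for 'test' in the filename
    return trainfile.replace('train', 'test')

def get_recsfile(trainfile, recsdir):
    # hypothetical: assume recommendations are written per training file
    # as <recsdir>/<train basename>.recs.tsv
    return os.path.join(recsdir, os.path.basename(trainfile) + '.recs.tsv')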
Example #2
def main():
    import os
    import glob
    import logging
    from collections import defaultdict
    from optparse import OptionParser
    from ipyparallel import Client

    from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate
    from mrec.evaluation import Evaluator
    from mrec.evaluation.metrics import print_report
    # get_testfile, get_modelfile and the per-file process() helper are assumed to be
    # defined at module level alongside this script (cf. Example #1's filename_conventions)
    ONE_MB = 1024 * 1024  # bytes per MB; presumably a module-level constant in the original

    logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('--mb_per_task', dest='mb_per_task', type='int', default=None,
                      help='approximate memory limit per task in MB, so total memory usage is num_engines * mb_per_task (default: share all available RAM across engines)')
    parser.add_option('--input_format', dest='input_format',
                      help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--test_input_format', dest='test_input_format', default='npz',
                      help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary)  (default: %default)')
    parser.add_option('--train', dest='train',
                      help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--modeldir', dest='modeldir', help='directory containing trained models')
    parser.add_option('--outdir', dest='outdir', help='directory for output files')
    parser.add_option('--metrics', dest='metrics', default='main',
                      help='which set of metrics to compute, main|hitrate (default: %default)')
    parser.add_option('--item_feature_format', dest='item_feature_format',
                      help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)')
    parser.add_option('--item_features', dest='item_features',
                      help='path to sparse item features in tsv format (item_id,feature_id,val)')
    parser.add_option('--overwrite', dest='overwrite', action='store_true', default=False,
                      help='overwrite existing files in outdir (default: %default)')
    parser.add_option('--packer', dest='packer', default='json', help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths', dest='add_module_paths',
                      help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    metrics_funcs = {'main': compute_main_metrics,
                     'hitrate': compute_hit_rate}

    (opts, args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.outdir \
            or not opts.modeldir or opts.metrics not in metrics_funcs:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.modeldir = os.path.abspath(os.path.expanduser(opts.modeldir))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()
    if opts.mb_per_task is None:
        import psutil
        num_engines = len(view)
        opts.mb_per_task = psutil.virtual_memory().available // ONE_MB // (num_engines + 1)  # don't take *all* the memory

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    evaluator = Evaluator(metrics_funcs[opts.metrics], max_items=20)

    trainfiles = glob.glob(opts.train)

    descriptions = set()
    all_metrics = defaultdict(list)
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile, opts.modeldir)
        testfile = get_testfile(trainfile)
        description, metrics = process(view, opts, modelfile, trainfile, testfile, opts.item_features, opts.outdir,
                                       evaluator)
        descriptions.add(description)
        if metrics is not None:
            for m in metrics:
                all_metrics[m].append(metrics[m])

    description = ' AND '.join(descriptions)
    if len(descriptions) > 1:
        logging.warning('You are aggregating metrics from different models! {}'.format(description))

    print_report([description], [all_metrics])
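
For reference, the memory-budgeting step near the top of this example simply divides the RAM currently available (as reported by psutil) across the engines, holding one engine's share back so the controller is not starved. A small standalone illustration of that arithmetic, assuming ONE_MB = 1024 * 1024 as noted in the import block above:

import psutil

ONE_MB = 1024 * 1024  # bytes per megabyte

def mb_per_task(num_engines):
    # split currently available RAM across the engines, keeping one
    # engine's worth in reserve
    available_mb = psutil.virtual_memory().available // ONE_MB
    return available_mb // (num_engines + 1)

print(mb_per_task(4))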
Example #3
def main():

    import os
    import logging
    import glob
    from optparse import OptionParser
    from collections import defaultdict

    from mrec import load_sparse_matrix
    from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate
    from mrec.evaluation import Evaluator
    from mrec.evaluation.metrics import print_report
    from filename_conventions import get_testfile, get_recsfile

    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option(
        '--input_format',
        dest='input_format',
        help=
        'format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)'
    )
    parser.add_option(
        '--test_input_format',
        dest='test_input_format',
        default='npz',
        help=
        'format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary)  (default: %default)'
    )
    parser.add_option(
        '--train',
        dest='train',
        help=
        'glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard'
    )
    parser.add_option(
        '--recsdir',
        dest='recsdir',
        help='directory containing tsv files of precomputed recommendations')
    parser.add_option(
        '--metrics',
        dest='metrics',
        default='main',
        help='which set of metrics to compute, main|hitrate (default: %default)'
    )
    parser.add_option(
        '--description',
        dest='description',
        help='description of model which generated the recommendations')
    metrics_funcs = {'main': compute_main_metrics, 'hitrate': compute_hit_rate}

    (opts, args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.recsdir \
            or opts.metrics not in metrics_funcs:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.recsdir = os.path.abspath(os.path.expanduser(opts.recsdir))

    evaluator = Evaluator(metrics_funcs[opts.metrics], max_items=20)

    trainfiles = glob.glob(opts.train)

    all_metrics = defaultdict(list)
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        testfile = get_testfile(trainfile)
        recsfile = get_recsfile(trainfile, opts.recsdir)
        testdata = load_sparse_matrix(opts.test_input_format, testfile).tocsr()
        cum_metrics, count = evaluator.process(testdata, recsfile, 0,
                                               testdata.shape[0])
        if cum_metrics is not None:
            for m in cum_metrics:
                all_metrics[m].append(float(cum_metrics[m]) / count)

    print_report([opts.description], [all_metrics])
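
The aggregation step at the end of the loop turns each cumulative metric into a per-user average for that dataset and collects one value per training file, so print_report receives a list of values per metric name. A small self-contained illustration of that bookkeeping, using made-up metric names and numbers in place of a real Evaluator:

from collections import defaultdict

all_metrics = defaultdict(list)

# pretend we evaluated two train/test splits; each cum_metrics holds metric
# sums over `count` test users, as returned by Evaluator.process above
fake_results = [
    ({'prec@5': 12.0, 'recall@5': 30.0}, 100),
    ({'prec@5': 10.0, 'recall@5': 25.0}, 80),
]
for cum_metrics, count in fake_results:
    for m in cum_metrics:
        all_metrics[m].append(float(cum_metrics[m]) / count)

print(dict(all_metrics))
# {'prec@5': [0.12, 0.125], 'recall@5': [0.3, 0.3125]}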
Example #4
def main():

    import os
    import glob
    import logging
    from collections import defaultdict
    from optparse import OptionParser
    from IPython.parallel import Client

    from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate
    from mrec.evaluation import Evaluator
    from mrec import load_recommender
    from mrec.evaluation.metrics import print_report
    # get_testfile, get_modelfile and the per-file process() helper are assumed to be
    # defined at module level alongside this script (cf. Example #1's filename_conventions)

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--test_input_format',dest='test_input_format',default='npz',help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary)  (default: %default)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--modeldir',dest='modeldir',help='directory containing trained models')
    parser.add_option('--outdir',dest='outdir',help='directory for output files')
    parser.add_option('--metrics',dest='metrics',default='main',help='which set of metrics to compute, main|hitrate (default: %default)')
    parser.add_option('--overwrite',dest='overwrite',action='store_true',default=False,help='overwrite existing files in outdir (default: %default)')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)')

    metrics_funcs = {'main':compute_main_metrics,
                     'hitrate':compute_hit_rate}

    (opts,args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines \
            or not opts.modeldir or opts.metrics not in metrics_funcs:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.modeldir = os.path.abspath(os.path.expanduser(opts.modeldir))
    opts.outdir = os.path.abspath(os.path.expanduser(opts.outdir))

    # create an ipython client
    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    evaluator = Evaluator(metrics_funcs[opts.metrics],max_items=20)

    trainfiles = glob.glob(opts.train)

    descriptions = set()
    all_metrics = defaultdict(list)
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        modelfile = get_modelfile(trainfile,opts.modeldir)
        testfile = get_testfile(trainfile)
        description,metrics = process(view,opts,modelfile,trainfile,testfile,opts.outdir,evaluator)
        descriptions.add(description)
        if metrics is not None:
            for m in metrics:
                all_metrics[m].append(metrics[m])

    description = ' AND '.join(descriptions)
    if len(descriptions) > 1:
        logging.warning('You are aggregating metrics from different models! {}'.format(description))

    print_report([description],[all_metrics])
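
The per-file evaluation in this example is delegated to a module-level process() helper that is not shown, via the load-balanced view. As a rough, hypothetical sketch of the general pattern (not the real helper), a load-balanced view can farm independent tasks out to the engines with map_sync; evaluate_task below is only a stand-in for whatever process() actually does:

from IPython.parallel import Client  # `from ipyparallel import Client` in newer releases

def evaluate_task(task_id):
    # stand-in for the real per-file work (load model, predict, score, write recs)
    return task_id * task_id

if __name__ == '__main__':
    # requires a running cluster, e.g. started with `ipcluster start -n 4`
    client = Client()
    view = client.load_balanced_view()
    print(view.map_sync(evaluate_task, range(8)))  # [0, 1, 4, 9, 16, 25, 36, 49]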