Example #1
0
def test_yakonfig_cli():
    """An empty command line should still expose yakonfig's own section."""
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [yakonfig], args=[])
    try:
        global_config = yakonfig.get_global_config()
        assert 'yakonfig' in global_config
    finally:
        yakonfig.clear_global_config()
Example #2
0
def test_cli_good():
    """A value given via --key should land in config['k']."""
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [ConfigurableArgs()], args=['--key', 'v'])
    try:
        cfg = yakonfig.get_global_config()
        assert 'config' in cfg
        assert 'k' in cfg['config']
        assert cfg['config']['k'] == 'v'
    finally:
        yakonfig.clear_global_config()
Example #3
0
def test_cli_file(request):
    """Passing -c with a YAML file should populate config['k'] from it."""
    yaml_path = str(request.fspath.dirpath('argconfig.yaml'))
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser,
                        [yakonfig, ConfigurableArgs()],
                        args=['-c', yaml_path])
    try:
        cfg = yakonfig.get_global_config()
        assert 'config' in cfg
        assert 'k' in cfg['config']
        assert cfg['config']['k'] == 'x'  # value supplied by the file
    finally:
        yakonfig.clear_global_config()
Example #4
0
def test_cli_overlay(request):
    """Command-line values win over both the config file and the defaults."""
    # config.k is in the default, *and* the config file, *and* the command line
    yaml_path = str(request.fspath.dirpath('argconfig.yaml'))
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser,
                        [yakonfig, ConfigurableArgs()],
                        args=['-c', yaml_path, '-k', 'y'])
    try:
        cfg = yakonfig.get_global_config()
        assert 'config' in cfg
        assert 'k' in cfg['config']
        assert cfg['config']['k'] == 'y'  # from the command line
    finally:
        yakonfig.clear_global_config()
def main():
    '''Launch the AMQP worker.'''
    # Filters exposed to incoming requests.
    filters = {
        'already_labeled': already_labeled,
        'geotime': geotime,
    }
    worker = AMQPWorker(filters)

    parser = argparse.ArgumentParser()
    yakonfig.parse_args(
        parser, [yakonfig, kvlayer, dblogger, coordinate, worker])
    worker.task_master = coordinate.TaskMaster(
        yakonfig.get_global_config(coordinate.config_name))

    worker.start()
Example #6
0
def main():
    '''Parse command-line options and launch the coordinated daemon.

    Side effects: optionally writes a pid file, patches the global
    configuration's snapshot path in place, and may start a yappi
    profiling logger thread before running the server.
    '''
    ap = argparse.ArgumentParser()
    ap.add_argument('--host', default=None,  # NOT -h, that's help
                    help='host that coordinated will listen on, '
                    '0.0.0.0 for any input interface')
    ap.add_argument('--port', '-p', type=int, default=None,
                    help='port number that coordinated will listen on')
    ap.add_argument('--pid', default=None,
                    help='file to write pid to')
    # BUG FIX: corrected "direcotry" typo in user-facing help text.
    ap.add_argument('--snapshot-dir', default=None,
                    help='directory to write snapshots to')
    ap.add_argument('--httpd', default=None,
                    help='ip:port or :port to serve http info on')
    if yappi is not None:
        # BUG FIX: corrected "suffied" typo in user-facing help text.
        ap.add_argument('--yappi', default=None,
                        help='file to write yappi profiling to. '
                        'will be suffixed by {timestamp}.txt')
    args = yakonfig.parse_args(ap, [yakonfig, dblogger, coordinate])

    # Record our pid so an init script can find the daemon later.
    if args.pid:
        with open(args.pid, 'w') as f:
            f.write(str(os.getpid()))

    if args.snapshot_dir is not None:
        cjqconfig = yakonfig.get_global_config('coordinate', 'job_queue')
        # (This modifies the global configuration in place)
        cjqconfig['snapshot_path_format'] = os.path.join(
            args.snapshot_dir, 'snapshot_{timestamp}')

    # Optional profiling; the logger thread is a daemon so it does not
    # block process exit.
    if (yappi is not None) and args.yappi:
        yappi.start()
        yt = threading.Thread(target=yappi_logger, args=(args.yappi,))
        yt.daemon = True
        yt.start()

    daemon = CoordinateServer(host=args.host, port=args.port, httpd=args.httpd)
    daemon.run()
Example #7
0
def main():
    '''Simple debugging tool for watching the linker and OpenQuery.

    ``action`` selects the mode: ``run`` executes the traverse/extract/
    fetch pipeline, ``delete`` removes one cached entry, and ``cache``
    counts cached queries (printing the record matching the key).
    '''
    p = argparse.ArgumentParser(
        'simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])

    config = yakonfig.get_global_config()

    # Keys normalize spaces to underscores; cbor serializes the pair.
    key = cbor.dumps(
        (args.folder.replace(' ', '_'), args.subfolder.replace(' ', '_')))

    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf],
                                       config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)

    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        kvlclient.delete('openquery', (key, ))
        print('deleted %r' % key)

    elif args.action == 'cache':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                # BUG FIX: was a Python 2 print statement; the rest of
                # this function already uses the function-call form,
                # which behaves identically for a single argument.
                print(rec)
        print('%d cached queries' % count)
Example #8
0
def main():
    '''Entry point: wire App's arguments into yakonfig and dispatch.'''
    arg_parser = argparse.ArgumentParser(
        description='Specific utilities for working with the ad corpus.')
    application = App()
    application.add_arguments(arg_parser)
    application.main(yakonfig.parse_args(arg_parser, [kvlayer, yakonfig]))
Example #9
0
def main():
    '''Command-line entry point for the truth-data tool.'''
    arg_parser = argparse.ArgumentParser(
        description='Interact with DossierStack truth data.')
    application = App()
    application.add_arguments(arg_parser)
    parsed = yakonfig.parse_args(arg_parser, [kvlayer, yakonfig])
    application.main(parsed)
Example #10
0
def default_app():
    '''Build the default DossierStack web app; returns (args, app).'''
    config = Config()
    arg_parser = argparse.ArgumentParser(
        description='Run DossierStack web services.')
    add_cli_arguments(arg_parser)
    parsed = yakonfig.parse_args(arg_parser,
                                 [config, dblogger, kvlayer, yakonfig])
    web_app = WebBuilder().set_config(config).enable_cors().get_app()
    return parsed, web_app
Example #11
0
def main():
    '''Entry point for the feature-collection store tool.'''
    arg_parser = argparse.ArgumentParser(
        description='Interact with the Dossier feature collection store.')
    application = App()
    application.add_arguments(arg_parser)
    parsed = yakonfig.parse_args(arg_parser, [kvlayer, yakonfig, Store])
    application.main(parsed)
Example #12
0
def main():
    '''Entry point for the DossierStack truth-data tool.'''
    arg_parser = argparse.ArgumentParser(
        description='Interact with DossierStack truth data.')
    application = App()
    application.add_arguments(arg_parser)
    application.main(yakonfig.parse_args(arg_parser, [kvlayer, yakonfig]))
Example #13
0
def test_check_toplevel_args():
    '''check_config_toplevel() should work in check_config() implementation'''
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [ConfigurableArgs(), Dependent],
                        ['--key', 'k'])
    try:
        config = yakonfig.get_global_config()
        assert sorted(iterkeys(config)) == ['config', 'dependent']
        assert config['config']['k'] == 'k'
    finally:
        yakonfig.clear_global_config()
    parser = argparse.ArgumentParser()
    try:
        with pytest.raises(SystemExit):
            yakonfig.parse_args(parser, [ConfigurableArgs(), Dependent],
                                ['--key', 'key'])
    finally:
        # BUG FIX: the cleanup used to sit inside the `with` block after
        # the call expected to raise SystemExit, so it never executed.
        # Run it unconditionally instead.
        yakonfig.clear_global_config()
Example #14
0
def main():
    '''Entry point for the ad-corpus utility app.'''
    arg_parser = argparse.ArgumentParser(
        description='Specific utilities for working with the ad corpus.')
    application = App()
    application.add_arguments(arg_parser)
    parsed = yakonfig.parse_args(arg_parser, [kvlayer, yakonfig])
    application.main(parsed)
Example #15
0
def main():
    '''Entry point for the FC-generation utilities.'''
    arg_parser = argparse.ArgumentParser(
        description='Utilities for generating FCs from artifacts.')
    application = App()
    application.add_arguments(arg_parser)
    application.main(yakonfig.parse_args(arg_parser, [kvlayer, yakonfig]))
Example #16
0
def main():
    '''Run the CoordinateC command-line app and exit with its code.'''
    arg_parser = argparse.ArgumentParser()
    app = CoordinateC()
    app.add_arguments(arg_parser)
    parsed = yakonfig.parse_args(arg_parser, [yakonfig, dblogger, coordinate])
    app.main(parsed)
    sys.exit(app.exitcode)
Example #17
0
def main():
    '''Simple debugging tool for watching the linker and OpenQuery.

    ``action`` selects the mode: ``run`` executes the traverse/extract/
    fetch pipeline, ``delete`` removes one cached entry, and ``cache``
    counts cached queries (printing the record matching the key).
    '''
    p = argparse.ArgumentParser('simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])

    config = yakonfig.get_global_config()

    # Keys normalize spaces to underscores; cbor serializes the pair.
    key = cbor.dumps((args.folder.replace(' ', '_'), args.subfolder.replace(' ', '_')))

    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf], config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)

    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        kvlclient.delete('openquery', (key,))
        print('deleted %r' % key)

    elif args.action == 'cache':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                # BUG FIX: was a Python 2 print statement; the rest of
                # this function already uses the function-call form,
                # which behaves identically for a single argument.
                print(rec)
        print('%d cached queries' % count)
Example #18
0
def main(options):
    """Run the recommender system on a sequence of topics.

    ``options`` is a four-item sequence of (method, feedback_options,
    poids, id_config); ``id_config`` selects which config<N>.yaml file
    the harness ambassador loads.
    """
    description = ('System using LDA, Kmeans and Solr to optimize '
                   'diversification and exploitation of different topics')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--overwrite", action="store_true")
    args = yakonfig.parse_args(parser, [yakonfig])

    logging.basicConfig(level=logging.DEBUG)

    config = yakonfig.get_global_config("harness")
    batch_size = config.get("batch_size", 5)
    run_file_path = config["run_file_path"]
    # NOTE(review): the original removed an existing run file regardless
    # of --overwrite (the sys.exit guard was commented out), making both
    # branches identical; collapsed here with behavior unchanged.
    if os.path.exists(run_file_path):
        os.remove(run_file_path)
        # sys.exit('%r already exists' % run_file_path)

    kvl_config = {"storage_type": "local", "namespace": "test",
                  "app_name": "test"}
    kvl = kvlayer.client(kvl_config)  # kept: client() may connect the backend
    method, feedback_options, poids, id_config = \
        options[0], options[1], options[2], options[3]
    # BUG FIX: converted Python 2 print statements to forms that behave
    # identically under both Python 2 and 3.
    print('%s %s' % (method, poids))
    system = SearchSystem([], method, feedback_options, poids)
    print(args.config)
    args.config = "config" + str(id_config) + ".yaml"
    print(args.config)
    ambassador = HarnessAmbassadorCLI(system, args.config, batch_size)
    ambassador.run()
def main():
    '''Command-line tool: find "soft selector" n-gram phrases in a corpus.

    Loads clean_visible texts from a streamcorpus.Chunk file and either
    scans a range of n-gram sizes for a strongly peaked best phrase
    (--scan-window-size) or reports results at a single size (-n).
    '''
    parser = argparse.ArgumentParser(
        'command line tool for debugging and development')
    parser.add_argument('corpus', help='path to a streamcorpus.Chunk file')
    parser.add_argument('-n', '--num-tokens', default=6, type=int,
                        help='the n of the ngrams; used as start_num_tokens '
                             'for scanning')
    parser.add_argument('--max-num-tokens', default=40, type=int,
                        help='maximum number of `n` in n-grams for scanning')
    parser.add_argument('--peak-score-delta', default=0.01, type=float,
                        help='delta in score values required between first '
                             'and second result to stop  scanning')
    parser.add_argument('--scan-window-size', default=False,
                        action='store_true',
                        help='if set, scans from the value of -n until it '
                             'finds a strongly peaked top value')
    parser.add_argument('--filter-punctuation', default=False,
                        action='store_true',
                        help='filter out punctuation; default is to not '
                             'filter punctuation')
    parser.add_argument('--show-ids', default=False, action='store_true',
                        help='show identifiers in diagnostic output')
    args = yakonfig.parse_args(parser, [yakonfig, dblogger])

    ## TODO: if we start needing to load FC chunk files (instead of SI
    ## chunk files), this might need to be told which kind of chunk it
    ## is loading, and we'll need a second function along the lines of
    ## ids_and_clean_visible_from_streamcorpus_chunk_path

    ## mimic the in-process interface:
    ids_and_clean_visible = ids_and_clean_visible_from_streamcorpus_chunk_path(
        args.corpus)
    logger.info('gathered %d texts', len(ids_and_clean_visible))

    def format_result(result):
        # result unpacks to (score, phrase, matching_texts); the ids are
        # appended only when --show-ids was given.
        score, soft_selector_phrase, matching_texts = result
        return '%.6f\t%d texts say:\t%s\t%s' % \
            (score, len(matching_texts), soft_selector_phrase.encode('utf8'),
             args.show_ids and repr(matching_texts) or '')

    if args.scan_window_size:
        # NOTE(review): --peak-score-delta is parsed but not forwarded
        # here; presumably find_soft_selectors applies its own default —
        # confirm against that function's signature.
        best = find_soft_selectors(
            ids_and_clean_visible,
            start_num_tokens=args.num_tokens,
            max_num_tokens=args.max_num_tokens,
            filtered_punctuation=args.filter_punctuation)
        if not best:
            print('failed to find a best result!')
        else:
            print('found a best result:')
            print('\n'.join(map(format_result, best)))

    else:
        results = find_soft_selectors_at_n(
            ids_and_clean_visible, args.num_tokens, args.filter_punctuation)
        print('\n'.join(map(format_result, results)))
Example #20
0
def main():
    '''Parse configuration, wrap it in a minimal work-unit shim, run worker.'''
    arg_parser = argparse.ArgumentParser()
    yakonfig.parse_args(arg_parser, [kvlayer, yakonfig])

    class Empty(object):
        pass

    # Mimic a work-unit object carrying the config in its spec.
    unit = Empty()
    unit.spec = dict(config=yakonfig.get_global_config())
    worker(unit)
Example #21
0
def main():
    '''Assemble the module list (dblogger optional) and run Actions.'''
    arg_parser = argparse.ArgumentParser()
    action = Actions()
    action.add_arguments(arg_parser)
    # dblogger is optional at import time; include it only when present.
    modules = [yakonfig] + ([dblogger] if dblogger else []) + [kvlayer]
    action.main(yakonfig.parse_args(arg_parser, modules))
Example #22
0
def main():
    '''Parse configuration, then hand off to a foreground or forked worker.'''
    arg_parser = argparse.ArgumentParser(
        description='run a rejester work-handling daemon')
    args_run_worker(arg_parser)
    parsed = yakonfig.parse_args(arg_parser, [yakonfig, rejester])
    worker_config = yakonfig.get_global_config()
    # The worker receives its configuration explicitly, so the global
    # copy is cleared before launching it.
    yakonfig.clear_global_config()
    if parsed.foreground:
        go(worker_config, parsed)
    else:
        fork_worker(worker_config, parsed)
Example #23
0
def main():
    '''Run kvlayer performance tests and report the results.

    Sweeps every combination of --num-workers, --num-items-per-batch
    and --item-size; returns the result code of the *last* run,
    suitable for sys.exit().
    '''
    parser = argparse.ArgumentParser(
        description='Run kvlayer performance tests on a single backend.',
        conflict_handler='resolve')
    parser.add_argument('--num-workers', action='append', default=[], type=int)
    parser.add_argument('--item-size', action='append', default=[], type=int,
                        help='size of the items to push in the large writes test, '
                        'defaults to maximum size per record in thrift RPC server '
                        'example, i.e. 15MB minus a bit of overhead')
    parser.add_argument('--num-items-per-batch', action='append', default=[], type=int,
                        help='number of items per batch in the large writes test, '
                        'defaults to 1')
    parser.add_argument('--num-batches', default=10, type=int,
                        help='number of batches in the large writes test, '
                        'defaults to 10')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--shutdown-proxies', action='store_true')
    parser.add_argument('--out', default=None, help='file to append results to')
    modules = [yakonfig]
    if dblogger:
        modules.append(dblogger)
    modules.append(kvlayer)
    args = yakonfig.parse_args(parser, modules)

    if args.out:
        out = open(args.out, 'a')
    else:
        out = sys.stdout

    # Fill in defaults for any sweep dimension the user did not set.
    if not args.item_size:
        args.item_size = [fifteen_MB_minus_overhead]
    if not args.num_workers:
        args.num_workers = [1]
    if not args.num_items_per_batch:
        args.num_items_per_batch = [1]

    # return code for sys.exit(); only the last run's code is kept
    rc = 0
    try:
        for num_workers in args.num_workers:
            for num_items_per_batch in args.num_items_per_batch:
                for item_size in args.item_size:
                    rc = run_perftests(
                        num_workers=num_workers,
                        item_size=item_size,
                        num_items_per_batch=num_items_per_batch,
                        num_batches=args.num_batches,
                        profile=args.profile,
                        out=out)
    finally:
        # BUG FIX: close the output file we opened (never close stdout).
        if out is not sys.stdout:
            out.close()

    if args.shutdown_proxies:
        # special feature of CBOR RPC proxy, really for testing only!
        client = kvlayer.client()
        client.shutdown_proxies()
    return rc
Example #24
0
def main():
    '''Build an object from the global configuration and show both.'''
    conf = Autoconfig(an_object)
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [conf])
    config = yakonfig.get_global_config()
    # BUG FIX: converted Python 2 print statements to forms that behave
    # identically under both Python 2 and 3 (single-argument print()
    # and print('') for the blank line).
    print("The global configuration:")
    print(config)
    print('')
    obj = conf(config)
    print("The object:")
    print(obj)
Example #25
0
def main():
    '''Build an object from the global configuration and show both.'''
    conf = Autoconfig(an_object)
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [conf])
    config = yakonfig.get_global_config()
    # BUG FIX: converted Python 2 print statements to forms that behave
    # identically under both Python 2 and 3 (single-argument print()
    # and print('') for the blank line).
    print("The global configuration:")
    print(config)
    print('')
    obj = conf(config)
    print("The object:")
    print(obj)
Example #26
0
def main():
    '''Score a TREC DD run file and write an annotated copy.

    Loads the run file, applies each requested scorer (all of them by
    default), prints a score summary, and writes the run -- with scores
    inserted -- to the output path.
    '''
    parser = argparse.ArgumentParser(__doc__, conflict_handler='resolve')
    parser.add_argument('run_file_path', help='path to run file to score.')
    # BUG FIX: added the missing space in the concatenated help string
    # (it previously rendered as "insertedinto run file.").
    parser.add_argument('scored_run_file_output_path',
                        help='path to file to create with scores inserted '
                        'into run file.')
    parser.add_argument('--overwrite',
                        action='store_true',
                        default=False,
                        help='overwrite any existing run file.')
    parser.add_argument('--verbose',
                        action='store_true',
                        default=False,
                        help='display verbose log messages.')
    parser.add_argument('--scorer',
                        action='append',
                        default=[],
                        dest='scorers',
                        help='names of scorer functions to run;'
                        ' if none are provided, it runs all of them')

    modules = [yakonfig, kvlayer]
    args = yakonfig.parse_args(parser, modules)

    if os.path.exists(args.scored_run_file_output_path):
        if args.overwrite:
            os.remove(args.scored_run_file_output_path)
        else:
            sys.exit('%r already exists' % args.scored_run_file_output_path)

    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level)

    kvl = kvlayer.client()
    label_store = LabelStore(kvl)

    run = load_run(args.run_file_path)

    if len(args.scorers) == 0:
        args.scorers = available_scorers.keys()

    for scorer_name in args.scorers:
        scorer = available_scorers.get(scorer_name)
        logger.info('running %s', scorer_name)
        # this modifies the run['scores'] object itself
        scorer(run, label_store)

    print(format_scores(run))

    # BUG FIX: the original leaked the file handle via
    # open(...).write(...); use a context manager instead.
    with open(args.scored_run_file_output_path, 'wb') as out:
        out.write(json.dumps(run, indent=4))
Example #27
0
def main():
    '''Main command-line entry point.'''
    parser = argparse.ArgumentParser(
        description='run a coordinate work-handling daemon')
    args_run_worker(parser)
    parsed = yakonfig.parse_args(parser, [yakonfig, coordinate])
    worker_config = yakonfig.get_global_config()
    # The worker receives its configuration explicitly, so the global
    # copy is cleared before launching it.
    yakonfig.clear_global_config()
    if parsed.foreground:
        go(worker_config, parsed)
    else:
        fork_worker(worker_config, parsed)
Example #28
0
def main():
    '''Load NIST TREC 2015 truth data into the configured kvlayer backend.'''
    parser = argparse.ArgumentParser('test tool for checking that we can load '
                                     'the truth data as distributed by NIST for '
                                     'TREC 2015')
    parser.add_argument('truth_data_path', help='path to truth data file')
    args = yakonfig.parse_args(parser, [yakonfig, kvlayer])
    logging.basicConfig(level=logging.DEBUG)
    store = LabelStore(kvlayer.client())
    parse_truth_data(store, args.truth_data_path)
    logger.debug('Done!  The truth data was loaded into this kvlayer backend: %r',
                 json.dumps(yakonfig.get_global_config('kvlayer'), indent=4,
                            sort_keys=True))
Example #29
0
def main():
    '''Run the streamcorpus pipeline over files named on the command line.

    Inputs come from -i/--input (use "-i -" to read paths from stdin)
    and/or --in-glob patterns; each path is processed as one work unit.
    '''
    import argparse
    parser = argparse.ArgumentParser(
        description='process a sequence of stream items',
        usage='streamcorpus_pipeline --config config.yaml --input file.in')
    parser.add_argument('-i', '--input', action='append', 
                        help='file paths to input instead of reading from stdin')
    parser.add_argument('--in-glob', action='append', default=[], help='path glob specifying input files')
    parser.add_argument('--third-dir-path', help='path to third-party tools directory')
    parser.add_argument('--tmp-dir-path', help='path to temporary directory for scratch files, can be large')

    modules = [yakonfig, kvlayer, dblogger, streamcorpus_pipeline]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()

    ## this modifies the global config, passed by reference
    instantiate_config(config)

    # Collect input paths: glob matches first, then explicit -i values.
    input_paths = []
    if args.in_glob:
        for pattern in args.in_glob:
            input_paths.extend(glob.glob(pattern))
    if args.input:
        if '-' in args.input:
            # "-i -" means read one path per line from stdin; it cannot
            # be combined with globs or with other -i values.
            if args.in_glob:
                sys.exit('cannot use "-i -" and --in-glob together')
            if len(args.input) > 1:
                sys.exit('cannot use "-i -" with multiple inputs')
            input_paths = sys.stdin
        else:
            input_paths.extend(args.input)

    # Build the pipeline, loading externally configured stages first.
    scp_config = config['streamcorpus_pipeline']
    stages = PipelineStages()
    if 'external_stages_path' in scp_config:
        stages.load_external_stages(scp_config['external_stages_path'])
    if 'external_stages_modules' in scp_config:
        for mod in scp_config['external_stages_modules']:
            stages.load_module_stages(mod)
    factory = PipelineFactory(stages)
    pipeline = factory(scp_config)

    for i_str in input_paths:
        # Each path becomes one work unit carrying timing counters.
        work_unit = SimpleWorkUnit(i_str.strip())
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

    ## explicitly call cleanup, which is idempotent
    pipeline.cleanup()
def main():
    '''Create rejester jobs that load elasticsearch.'''
    arg_parser = argparse.ArgumentParser(
        description='create rejester jobs to load elasticsearch',
        conflict_handler='resolve')
    arg_parser.add_argument('--source', action='append',
                            help='source strings to consider')
    arg_parser.add_argument('--work-spec-name', '-W', metavar='NAME',
                            default='elasticsearch',
                            help='name of rejester work spec')
    parsed = yakonfig.parse_args(arg_parser,
                                 [yakonfig, rejester, kvlayer, dblogger])

    task_master = rejester.TaskMaster(yakonfig.get_global_config('rejester'))
    make_rejester_jobs(task_master, kvlayer.client(), parsed.source,
                       parsed.work_spec_name)
def main():
    '''Queue directories/files for processing with streamcorpus_pipeline.

    Builds one work unit per input file and submits the bundle to
    rejester, or runs the units inline, depending on the configured
    streamcorpus_directory engine.
    '''
    parser = argparse.ArgumentParser(
        conflict_handler='resolve',
        description='process entire directories using streamcorpus_pipeline')
    parser.add_argument('directories',
                        nargs='+',
                        metavar='directory',
                        help='directory name(s) to process')
    args = yakonfig.parse_args(parser, [
        yakonfig, rejester, kvlayer, dblogger, streamcorpus_pipeline,
        DirectoryConfig
    ])
    gconfig = yakonfig.get_global_config()
    scdconfig = gconfig['streamcorpus_directory']

    # The work spec carries the whole global config so remote workers
    # can reconstruct the same environment.
    work_spec = {
        'name': scdconfig.get('name', 'streamcorpus_directory'),
        'desc': 'read files from a directory',
        'min_gb': 8,
        'config': gconfig,
        'module': 'streamcorpus_pipeline._rejester',
        'run_function': 'rejester_run_function',
        'terminate_function': 'rejester_terminate_function',
    }

    def get_filenames():
        # Interpret the positional arguments per the configured mode:
        # literal file names, files listing names, or directories to walk.
        for d in args.directories:
            if scdconfig['mode'] == 'files':
                yield d
            elif scdconfig['mode'] == 'file-lists':
                with open(d, 'r') as f:
                    for line in f:
                        yield line.strip()
            elif scdconfig['mode'] == 'directories':
                for dirpath, dirnames, filenames in os.walk(d):
                    for filename in filenames:
                        yield os.path.abspath(os.path.join(dirpath, filename))

    work_units = {filename: {'start_count': 0} for filename in get_filenames()}

    if scdconfig['engine'] == 'rejester':
        tm = rejester.TaskMaster(gconfig['rejester'])
        tm.update_bundle(work_spec, work_units)
    elif scdconfig['engine'] == 'standalone':
        # iteritems: this module targets Python 2
        for k, v in work_units.iteritems():
            u = SimpleWorkUnit(k)
            u.spec = work_spec
            u.data = v
            rejester_run_function(u)
Example #32
0
def main():
    '''Command line interface to the office TREC DD jig.

    Dispatches on the positional ``command``: load (truth data into
    kvlayer), init, start, step, stop.
    '''
    parser = argparse.ArgumentParser(
        'Command line interface to the office TREC DD jig.',
        usage=usage,
        conflict_handler='resolve')
    parser.add_argument('command', help='must be "load", "init", "start", "step", or "stop"')
    parser.add_argument('args', help='input for given command',
                        nargs=argparse.REMAINDER)
    modules = [yakonfig, kvlayer, Harness]
    args = yakonfig.parse_args(parser, modules)

    logging.basicConfig(level=logging.DEBUG)

    if args.command not in set(['load', 'init', 'start', 'step', 'stop']):
        sys.exit('The only valid commands are "load", "init", "start", "step", and "stop".')

    kvl = kvlayer.client()
    label_store = LabelStore(kvl)
    config = yakonfig.get_global_config('harness')
    harness = Harness(config, kvl, label_store)

    if args.command == 'load':
        # Loading requires a truth-data file supplied via configuration.
        if not config.get('truth_data_path'):
            sys.exit('Must provide --truth-data-path as an argument')
        if not os.path.exists(config['truth_data_path']):
            sys.exit('%r does not exist' % config['truth_data_path'])
        parse_truth_data(label_store, config['truth_data_path'])
        logger.info('Done!  The truth data was loaded into this '
                     'kvlayer backend:\n%s',
                    json.dumps(yakonfig.get_global_config('kvlayer'),
                               indent=4, sort_keys=True))

    elif args.command == 'init':
        response = harness.init()
        print(json.dumps(response))

    elif args.command == 'start':
        response = harness.start()
        print(json.dumps(response))

    elif args.command == 'stop':
        # stop takes the run id as its single extra argument
        response = harness.stop(args.args[0])
        print(json.dumps(response))

    elif args.command == 'step':
        # step takes a topic id followed by zero or more parts
        parts = args.args
        topic_id = parts.pop(0)
        feedback = harness.step(topic_id, parts)
        print(json.dumps(feedback))
Example #33
0
def get_application():
    '''Build the DossierStack bottle application; returns (args, app).'''
    config = Config()
    parser = argparse.ArgumentParser(description='Run DossierStack web services.')
    web.add_cli_arguments(parser)
    args = yakonfig.parse_args(parser, [dblogger, config, kvlayer, yakonfig])

    bottle.debug(True)
    # Same builder chain as before, one step per line for readability.
    builder = web.WebBuilder()
    builder = builder.set_config(config)
    builder = builder.enable_cors()
    builder = builder.inject('tfidf', lambda: config.tfidf)
    builder = builder.inject('google', lambda: config.google)
    builder = builder.add_routes(models_app)
    builder = builder.add_filter('already_labeled', same_subfolder)
    builder = builder.add_search_engine('similar', similar)
    builder = builder.add_search_engine('dissimilar', dissimilar)
    return args, builder.get_app()
def get_application():
    '''Build the Memex Dossier Stack bottle application; returns (args, app).'''
    config = Config()
    parser = argparse.ArgumentParser(
        description='Run Memex Dossier Stack web services.')
    web.add_cli_arguments(parser)
    args = yakonfig.parse_args(parser, [dblogger, config, kvlayer, yakonfig])

    bottle.debug(True)
    # Same builder chain as before, one step per line for readability.
    builder = (web.WebBuilder()
               .set_config(config)
               .enable_cors()
               .inject('tfidf', lambda: config.tfidf)
               .inject('google', lambda: config.google)
               .inject('akagraph', lambda: config.akagraph)
               .inject('akagraph_replicas', lambda: config.akagraph_replicas)
               .add_routes(models_app))
    return args, builder.get_app()
def main():
    '''Queue directories/files for processing with streamcorpus_pipeline.

    Builds one work unit per input file and submits the bundle to
    rejester, or runs the units inline, depending on the configured
    streamcorpus_directory engine.
    '''
    parser = argparse.ArgumentParser(conflict_handler='resolve',
        description='process entire directories using streamcorpus_pipeline')
    parser.add_argument('directories', nargs='+', metavar='directory',
                        help='directory name(s) to process')
    args = yakonfig.parse_args(parser, [yakonfig, rejester, kvlayer, dblogger,
                                        streamcorpus_pipeline, DirectoryConfig])
    gconfig = yakonfig.get_global_config()
    scdconfig = gconfig['streamcorpus_directory']
    
    # The work spec carries the whole global config so remote workers
    # can reconstruct the same environment.
    work_spec = {
        'name': scdconfig.get('name', 'streamcorpus_directory'),
        'desc': 'read files from a directory',
        'min_gb': 8,
        'config': gconfig,
        'module': 'streamcorpus_pipeline._rejester',
        'run_function': 'rejester_run_function',
        'terminate_function': 'rejester_terminate_function',
    }

    def get_filenames():
        # Interpret the positional arguments per the configured mode:
        # literal file names, files listing names, or directories to walk.
        for d in args.directories:
            if scdconfig['mode'] == 'files':
                yield d
            elif scdconfig['mode'] == 'file-lists':
                with open(d, 'r') as f:
                    for line in f:
                        yield line.strip()
            elif scdconfig['mode'] == 'directories':
                for dirpath, dirnames, filenames in os.walk(d):
                    for filename in filenames:
                        yield os.path.abspath(os.path.join(dirpath, filename))

    work_units = { filename: { 'start_count': 0 }
                   for filename in get_filenames() }

    if scdconfig['engine'] == 'rejester':
        tm = rejester.TaskMaster(gconfig['rejester'])
        tm.update_bundle(work_spec, work_units)
    elif scdconfig['engine'] == 'standalone':
        # iteritems: this module targets Python 2
        for k,v in work_units.iteritems():
            u = SimpleWorkUnit(k)
            u.spec = work_spec
            u.data = v
            rejester_run_function(u)
Example #36
0
def get_application():
    '''Assemble the DossierStack web application; returns (args, app).'''
    config = Config()
    parser = argparse.ArgumentParser(description='Run DossierStack web services.')
    web.add_cli_arguments(parser)
    args = yakonfig.parse_args(parser, [dblogger, config, kvlayer, yakonfig])

    bottle.debug(True)
    # Same builder chain as before, one step per line for readability.
    builder = web.WebBuilder()
    builder = builder.set_config(config)
    builder = builder.enable_cors()
    builder = builder.inject('tfidf', lambda: config.tfidf)
    builder = builder.inject('google', lambda: config.google)
    builder = builder.add_routes(models_app)
    builder = builder.add_filter('already_labeled', same_subfolder)
    builder = builder.add_search_engine('similar', similar)
    builder = builder.add_search_engine('dissimilar', dissimilar)
    return args, builder.get_app()
Example #37
0
def main():
    '''Point kvlayer at the requested backend and scan batch sizes.'''
    import argparse
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--storage_type', default='redis')
    parser.add_argument('--storage_address', nargs='?',
                        dest='storage_addresses')
    args = yakonfig.parse_args(parser, [yakonfig, kvlayer])
    config = yakonfig.get_global_config()

    # Fall back to the default redis host when none was given.
    addresses = args.storage_addresses or ['redis.diffeo.com:6379']

    config['kvlayer'].update({
        'storage_type': args.storage_type,
        'storage_addresses': addresses,
        })
    scan_batch_size(kvlayer.client())
Example #38
0
def main():
    '''Run the random recommender system on a sequence of topics.

    Loads the 'harness' config, seeds a local kvlayer label store with
    the truth data, and drives a :class:`RandomSystem` through the
    harness ambassador CLI.
    '''
    description = (
        'A baseline recommender system that uses the truth data to'
        ' create output that has perfect recall and would also have'
        ' perfect precision if you ignore subtopic diversity/novelty.'
        ' This generates output directly from the truth data and'
        ' randomly shuffles the truth data per topic, so that'
        ' the ordering of passages does not attempt to optimize any'
        ' particular quality metric.')
    arg_parser = argparse.ArgumentParser(description=description)
    arg_parser.add_argument('--overwrite', action='store_true')
    args = yakonfig.parse_args(arg_parser, [yakonfig])

    logging.basicConfig(level=logging.DEBUG)

    harness_config = yakonfig.get_global_config('harness')
    batch_size = harness_config.get('batch_size', 5)

    # Refuse to clobber an existing run file unless --overwrite was given.
    run_file_path = harness_config['run_file_path']
    if os.path.exists(run_file_path):
        if not args.overwrite:
            sys.exit('%r already exists' % run_file_path)
        os.remove(run_file_path)

    kvl = kvlayer.client({
        'storage_type': 'local',
        'namespace': 'test',
        'app_name': 'test',
    })
    label_store = LabelStore(kvl)
    parse_truth_data(label_store, harness_config['truth_data_path'])

    # Wire the random system to a doc store built from the labels.
    doc_store = make_doc_store(label_store)
    ambassador = HarnessAmbassadorCLI(RandomSystem(doc_store),
                                      args.config, batch_size)
    ambassador.run()
def main():
    '''Run the random recommender system on a sequence of topics.

    Baseline driver: parses truth data into a local label store and
    runs the random system under the harness ambassador.
    '''
    description = ('A baseline recommender system that uses the truth data to'
                   ' create output that has perfect recall and would also have'
                   ' perfect precision if you ignore subtopic diversity/novelty.'
                   ' This generates output directly from the truth data and'
                   ' randomly shuffles the truth data per topic, so that'
                   ' the ordering of passages does not attempt to optimize any'
                   ' particular quality metric.')
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument('--overwrite', action='store_true')
    args = yakonfig.parse_args(cli, [yakonfig])

    logging.basicConfig(level=logging.DEBUG)

    cfg = yakonfig.get_global_config('harness')
    batch_size = cfg.get('batch_size', 5)

    # Either remove a stale run file (--overwrite) or bail out.
    run_file = cfg['run_file_path']
    if os.path.exists(run_file):
        if not args.overwrite:
            sys.exit('%r already exists' % run_file)
        os.remove(run_file)

    kvl_config = dict(storage_type='local', namespace='test', app_name='test')
    label_store = LabelStore(kvlayer.client(kvl_config))
    parse_truth_data(label_store, cfg['truth_data_path'])

    # Set up the system and run it.
    system = RandomSystem(make_doc_store(label_store))
    HarnessAmbassadorCLI(system, args.config, batch_size).run()
def main():
    '''Compute ngram statistics over the configured corpora.

    The ngram length comes from ``-n`` (default 2).  Statistics are
    computed per corpus but not yet persisted anywhere (see TODO notes
    below).
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-n', default=2, type=int,
        help='the n of the ngrams')
    args = yakonfig.parse_args(parser, [yakonfig, dblogger])
    n = args.n

    ## note, these are initialized to be lower case
    corpora = initialize_corpora()

    # which = ['male', 'female', 'english']
    which = ['english']

    for corpus_name in which:
        corpus = corpora[corpus_name]
        stats = compute_statistics(corpus, n)

    ## TODO: stats still needs to be saved somewhere

    ## TODO: also need to remember how default dict works in order to
    ## hapax legomenoma it

    # BUG FIX: removed leftover `import pdb; pdb.set_trace()` debugger
    # breakpoint that halted the program at the end of every run.
Exemple #41
0
def main():
    '''Ingest, query, and analyze AKA records in ElasticSearch.

    Each flag selects a mode; most modes call ``sys.exit()`` when done:

    * ``--parent``     look up the replica parents of a single node
    * ``--query``      dump the connected component for an identifier,
                       or append it as CSV pair rows via ``--make-pairs``
    * ``--delete``     drop the index
    * ``--ingest``     load record files (gzipped CBOR or an ETL format)
    * ``--analyze``    print cluster statistics as JSON

    NOTE(review): this is Python 2 code (``unicode``, ``string.letters``,
    binary-append csv writing) — porting to Python 3 would need changes.
    '''
    from memex_dossier.models.web.config import Config

    p = argparse.ArgumentParser('Ingest AKA records into ElasticSearch.')
    p.add_argument('--k-replicas', default=1, type=int)
    p.add_argument('--buffer-size', default=100, type=int)
    p.add_argument('--delete', action='store_true', default=False)
    p.add_argument('--parent')
    p.add_argument('--query')
    p.add_argument('--make-pairs', help='path to a csv file to create')
    p.add_argument('--input-format', default=None)
    p.add_argument('--ingest',
                   nargs='+',
                   help='record files in gzipped CBOR or an ETL format.')
    p.add_argument('--analyze',
                   action='store_true',
                   default=False,
                   help='output analysis of all clusters')
    p.add_argument('--limit',
                   default=None,
                   type=int,
                   help='number of records to process.')
    config = Config()
    args = yakonfig.parse_args(p, [dblogger, config, kvlayer, yakonfig])

    logging.basicConfig(level=logging.DEBUG)

    aka = config.akagraph
    aka.buffer_size = args.buffer_size

    if args.parent:
        # One parent lookup per replica of the requested node.
        data = [
            aka.get_parent(AKANode(unicode(args.parent), i))
            for i in aka.replica_list
        ]
        logger.info(data)
        sys.exit()

    if args.query:
        # Collect the connected component, tagging each record with its
        # confidence so downstream output can filter/score on it.
        cluster = []
        ccs = aka.find_connected_component(args.query)
        for rec, confidence in ccs:
            rec['confidence'] = confidence
            cluster.append(rec)
        if not args.make_pairs:
            logger.info(json.dumps(cluster, indent=4, sort_keys=True))
        else:
            assert args.make_pairs.endswith('.csv'),\
                '--make-pairs must end in ".csv"'

            # make the four-column Memex eval format
            # these functions are used in the loop below

            def make_name(rec):
                # Join all identifier values into one comma-separated
                # display name.  NOTE(review): currently unused below.
                vals = []
                for key in ['email', 'name', 'phone', 'bitcoin']:
                    if key not in rec: continue
                    vals.extend(rec[key])
                return u','.join(vals)

            def domain(rec):
                # Host part of the record URL; scheme URLs split as
                # ['http:', '', 'host', ...] so index 2 is the host.
                parts = rec['url'].split('/')
                if len(parts) < 3:
                    logger.debug(parts)
                    return parts[0]
                return parts[2]

            def tld(rec):
                # Last dotted component of the domain, e.g. 'com'.
                return domain(rec).split('.')[-1]

            def bad_username(un):
                # Heuristic filter: reject names starting with a digit,
                # containing boilerplate words, or containing anything
                # outside ASCII letters/digits.
                if un[0] in set(string.digits): return True
                for bad_word in [
                        'Home', 'Wish', 'Cart', 'Shopping', 'Account', 'User'
                ]:
                    if bad_word in un: return True
                if len(set(un) - set(string.digits + string.letters)) > 0:
                    return True
                return False

            # 'ab' appends, so repeated invocations accumulate rows.
            with open(args.make_pairs, 'ab') as fh:
                writer = csv.writer(fh)
                # consider all pairs
                for i in range(len(cluster)):
                    for j in range(i, len(cluster)):
                        r1 = cluster[i]
                        r2 = cluster[j]
                        if 'bogus' in r1['url']: continue
                        if 'bogus' in r2['url']: continue
                        c1 = r1['confidence']
                        c2 = r2['confidence']
                        # Keep pairs where at least one side has
                        # confidence >= 1.
                        if not (1 <= c1 or 1 <= c2):
                            continue
                        score = min(r1['confidence'], r2['confidence'])
                        pair_key = ','.join(
                            [args.query, domain(r1),
                             domain(r2)])
                        tld_key = ','.join(sorted([tld(r1), tld(r2)]))
                        identifier_types = [
                            'email', 'username'
                        ]  #### IGNORE phone and bitconin and name
                        for k1 in identifier_types:
                            if k1 not in r1: continue
                            for k2 in identifier_types:
                                if k2 not in r2: continue
                                identifier_type_key = ','.join(sorted([k1,
                                                                       k2]))
                                for n1 in r1[k1]:
                                    for n2 in r2[k2]:
                                        if k1 == 'email':
                                            n1 = n1.split('@')[0]
                                        if k2 == 'email':
                                            n2 = n2.split(
                                                '@'
                                            )[0]  ### they only want to *see* a username-like string
                                        if bad_username(n1): continue
                                        if bad_username(n2): continue
                                        row = (
                                            r1['url'],
                                            n1.encode('utf8'),
                                            r2['url'],
                                            n2.encode('utf8'),
                                            score,
                                            identifier_type_key,
                                            pair_key,
                                            tld_key,
                                        )
                                        writer.writerow(row)
        sys.exit()

    if args.delete:
        aka.delete_index()
        sys.exit()

    if args.input_format:
        loader = get_etl_transforms(args.input_format)
    else:
        loader = None

    if args.ingest:
        logger.debug('running ingest with loader=%r: %r', loader, aka)
        run_ingest(args, loader, aka)

    if args.analyze:
        stats = aka.analyze_clusters(limit=args.limit)
        print(json.dumps(stats, indent=4, sort_keys=True))
        sys.exit()
Exemple #42
0
def main():
    '''CLI entry point for the Dossier feature collection store.'''
    parser = argparse.ArgumentParser(
        description="Interact with the Dossier feature collection store.")
    app = App()
    app.add_arguments(parser)
    app.main(yakonfig.parse_args(parser, [kvlayer, yakonfig, Store]))
Exemple #43
0
            # else:
            #     ## phone numbers
            #     n = list()
            #     for idx in xrange(10):
            #         n.append(random.randint(0,9))
            #     num = (n[0], n[1], n[2], n[3], n[4], n[5], n[6], n[7], n[8], n[9])

            #     if random.random() < 0.5:
            #         word = '(%d%d%d) %d%d%d-%d%d%d%d' % num
            #     else:
            #         word = '%d%d%d-%d%d%d-%d%d%d%d' % num

            if not word in examples:
                examples.add(word)
                break

    return list(examples)


if __name__ == '__main__':
    # Build a negative-training corpus of the requested size and write
    # one example per line.
    parser = argparse.ArgumentParser()
    parser.add_argument('N', help='desired corpus size', type=int)
    args = yakonfig.parse_args(parser, [yakonfig, dblogger])
    N = args.N

    examples = create_corpus(N)

    with open('negative-training-data-%d.txt' % N, 'w') as f:
        f.writelines(word + '\n' for word in examples)
Exemple #44
0
def toy_main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--thing')
    modules = [yakonfig]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()