def test_yakonfig_cli():
    '''parse_args() with only the yakonfig module populates the global config.'''
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [yakonfig], args=[])
    try:
        c = yakonfig.get_global_config()
        assert 'yakonfig' in c
    finally:
        # always reset global state so later tests start clean
        yakonfig.clear_global_config()
def test_cli_good():
    '''A module-declared --key argument lands in the global config.'''
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [ConfigurableArgs()], args=['--key', 'v'])
    try:
        c = yakonfig.get_global_config()
        assert 'config' in c
        assert 'k' in c['config']
        assert c['config']['k'] == 'v'
    finally:
        yakonfig.clear_global_config()
def test_cli_file(request):
    '''Values from a -c config file override the module default.'''
    yaml = str(request.fspath.dirpath('argconfig.yaml'))
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [yakonfig, ConfigurableArgs()],
                        args=['-c', yaml])
    try:
        c = yakonfig.get_global_config()
        assert 'config' in c
        assert 'k' in c['config']
        assert c['config']['k'] == 'x'  # from the file
    finally:
        yakonfig.clear_global_config()
def test_cli_overlay(request):
    '''Command line beats config file beats module default.'''
    # config.k is in the default, *and* the config file, *and* the command line
    yaml = str(request.fspath.dirpath('argconfig.yaml'))
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [yakonfig, ConfigurableArgs()],
                        args=['-c', yaml, '-k', 'y'])
    try:
        c = yakonfig.get_global_config()
        assert 'config' in c
        assert 'k' in c['config']
        assert c['config']['k'] == 'y'  # from the command line
    finally:
        yakonfig.clear_global_config()
def main():
    '''Launch the AMQP worker.'''
    filters = {
        'already_labeled': already_labeled,
        'geotime': geotime,
    }
    worker = AMQPWorker(filters)
    parser = argparse.ArgumentParser()
    modules = [yakonfig, kvlayer, dblogger, coordinate, worker]
    yakonfig.parse_args(parser, modules)
    # wire the worker to the coordinate task queue before starting it
    worker.task_master = coordinate.TaskMaster(
        yakonfig.get_global_config(coordinate.config_name))
    worker.start()
def main():
    '''Start the coordinated server daemon.

    Supports an optional pid file, snapshot directory, embedded HTTP
    status page, and (when available) yappi profiling.
    '''
    ap = argparse.ArgumentParser()
    ap.add_argument('--host', default=None,  # NOT -h, that's help
                    help='host that coordinated will listen on, '
                    '0.0.0.0 for any input interface')
    ap.add_argument('--port', '-p', type=int, default=None,
                    help='port number that coordinated will listen on')
    ap.add_argument('--pid', default=None,
                    help='file to write pid to')
    ap.add_argument('--snapshot-dir', default=None,
                    # fixed typo: was 'direcotry'
                    help='directory to write snapshots to')
    ap.add_argument('--httpd', default=None,
                    help='ip:port or :port to serve http info on')
    if yappi is not None:
        ap.add_argument('--yappi', default=None,
                        # fixed typo: was 'suffied'
                        help='file to write yappi profiling to. '
                        'will be suffixed by {timestamp}.txt')
    args = yakonfig.parse_args(ap, [yakonfig, dblogger, coordinate])
    if args.pid:
        with open(args.pid, 'w') as f:
            f.write(str(os.getpid()))
    if args.snapshot_dir is not None:
        cjqconfig = yakonfig.get_global_config('coordinate', 'job_queue')
        # (This modifies the global configuration in place)
        cjqconfig['snapshot_path_format'] = os.path.join(
            args.snapshot_dir, 'snapshot_{timestamp}')
    if (yappi is not None) and args.yappi:
        yappi.start()
        # daemon thread: don't block shutdown on the profiling logger
        yt = threading.Thread(target=yappi_logger, args=(args.yappi,))
        yt.daemon = True
        yt.start()
    daemon = CoordinateServer(host=args.host, port=args.port,
                              httpd=args.httpd)
    daemon.run()
def main():
    '''Simple debugging tool for watching the linker and OpenQuery.

    Supports three actions: ``run`` (execute extraction), ``delete``
    (remove a cached query), ``cache`` (list cached queries).
    '''
    p = argparse.ArgumentParser(
        'simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])
    config = yakonfig.get_global_config()
    # keys are CBOR-encoded (folder, subfolder) pairs with spaces normalized
    key = cbor.dumps(
        (args.folder.replace(' ', '_'), args.subfolder.replace(' ', '_')))
    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf],
                                       config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)
    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        kvlclient.delete('openquery', (key, ))
        print('deleted %r' % key)
    elif args.action == 'cache':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                # fixed: was a bare Python 2 `print rec` statement; a
                # single-argument print() behaves identically on 2 and 3
                print(rec)
        print('%d cached queries' % count)
def main():
    '''Entry point: ad-corpus utilities driven by the App class.'''
    p = argparse.ArgumentParser(
        description='Specific utilities for working with the ad corpus.')
    app = App()
    app.add_arguments(p)
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])
    app.main(args)
def main():
    '''Entry point: DossierStack truth-data utilities via the App class.'''
    p = argparse.ArgumentParser(
        description='Interact with DossierStack truth data.')
    app = App()
    app.add_arguments(p)
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])
    app.main(args)
def default_app():
    '''Build the default DossierStack web app.

    Returns a ``(args, app)`` pair: the parsed CLI namespace and the
    CORS-enabled WSGI application.
    '''
    config = Config()
    p = argparse.ArgumentParser(description='Run DossierStack web services.')
    add_cli_arguments(p)
    args = yakonfig.parse_args(p, [config, dblogger, kvlayer, yakonfig])
    app = WebBuilder().set_config(config).enable_cors().get_app()
    return args, app
def main():
    '''Entry point: feature-collection store utilities via the App class.'''
    p = argparse.ArgumentParser(
        description='Interact with the Dossier feature collection store.')
    app = App()
    app.add_arguments(p)
    args = yakonfig.parse_args(p, [kvlayer, yakonfig, Store])
    app.main(args)
def test_check_toplevel_args():
    '''check_config_toplevel() should work in check_config() implementation'''
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [ConfigurableArgs(), Dependent],
                        ['--key', 'k'])
    try:
        config = yakonfig.get_global_config()
        assert sorted(iterkeys(config)) == ['config', 'dependent']
        assert config['config']['k'] == 'k'
    finally:
        yakonfig.clear_global_config()
    # a value Dependent rejects should abort argument parsing
    parser = argparse.ArgumentParser()
    with pytest.raises(SystemExit):
        yakonfig.parse_args(parser, [ConfigurableArgs(), Dependent],
                            ['--key', 'key'])
    yakonfig.clear_global_config()
def main():
    '''Entry point: FC-generation utilities via the App class.'''
    p = argparse.ArgumentParser(
        description='Utilities for generating FCs from artifacts.')
    app = App()
    app.add_arguments(p)
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])
    app.main(args)
def main():
    '''Run the CoordinateC command-line app and exit with its status.'''
    parser = argparse.ArgumentParser()
    app = CoordinateC()
    app.add_arguments(parser)
    args = yakonfig.parse_args(parser, [yakonfig, dblogger, coordinate])
    app.main(args)
    sys.exit(app.exitcode)
def main():
    '''Simple debugging tool for watching the linker and OpenQuery.

    Actions: ``run`` (execute extraction), ``delete`` (remove a cached
    query), ``cache`` (list cached queries).
    '''
    p = argparse.ArgumentParser('simple debugging tool for watching the '
                                'linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])
    config = yakonfig.get_global_config()
    # keys are CBOR-encoded (folder, subfolder) pairs with spaces normalized
    key = cbor.dumps((args.folder.replace(' ', '_'),
                      args.subfolder.replace(' ', '_')))
    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf],
                                       config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)
    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        kvlclient.delete('openquery', (key,))
        print('deleted %r' % key)
    elif args.action == 'cache':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                # fixed: was a bare Python 2 `print rec` statement; a
                # single-argument print() behaves identically on 2 and 3
                print(rec)
        print('%d cached queries' % count)
def main(options):
    """Run the recommender system on a sequence of topics.

    ``options`` is an indexable of ``(method, feedback_options, poids,
    id_config)`` -- assumed from the unpacking below; TODO confirm
    against callers.
    """
    description = ("System using LDA, Kmeans and Solr to optimize "
                   "diversification and exploitation of different topics")
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--overwrite", action="store_true")
    args = yakonfig.parse_args(parser, [yakonfig])
    logging.basicConfig(level=logging.DEBUG)
    config = yakonfig.get_global_config("harness")
    batch_size = config.get("batch_size", 5)
    run_file_path = config["run_file_path"]
    if os.path.exists(run_file_path):
        # NOTE(review): both branches remove the file, so --overwrite has
        # no effect here; the else branch presumably should refuse (see
        # the commented-out sys.exit). Behavior preserved as found.
        if args.overwrite:
            os.remove(run_file_path)
        else:
            os.remove(run_file_path)
            # sys.exit('%r already exists' % run_file_path)
    kvl_config = {"storage_type": "local",
                  "namespace": "test",
                  "app_name": "test"}
    kvl = kvlayer.client(kvl_config)
    method, feedback_options, poids, id_config = (
        options[0], options[1], options[2], options[3])
    # fixed: was a Python 2 `print method, poids` statement; formatting
    # first keeps the exact same output on both Python 2 and 3
    print('%s %s' % (method, poids))
    system = SearchSystem([], method, feedback_options, poids)
    print(args.config)
    args.config = "config" + str(id_config) + ".yaml"
    print(args.config)
    ambassador = HarnessAmbassadorCLI(system, args.config, batch_size)
    ambassador.run()
def main():
    '''Command line tool for scanning texts for soft-selector n-grams.'''
    parser = argparse.ArgumentParser(
        'command line tool for debugging and development')
    parser.add_argument('corpus', help='path to a streamcorpus.Chunk file')
    parser.add_argument('-n', '--num-tokens', default=6, type=int,
                        help='the n of the ngrams; used as start_num_tokens '
                        'for scanning')
    parser.add_argument('--max-num-tokens', default=40, type=int,
                        help='maximum number of `n` in n-grams for scanning')
    parser.add_argument('--peak-score-delta', default=0.01, type=float,
                        help='delta in score values required between first '
                        'and second result to stop scanning')
    parser.add_argument('--scan-window-size', default=False,
                        action='store_true',
                        help='if set, scans from the value of -n until it '
                        'finds a strongly peaked top value')
    parser.add_argument('--filter-punctuation', default=False,
                        action='store_true',
                        help='filter out punctuation; default is to not '
                        'filter punctuation')
    parser.add_argument('--show-ids', default=False, action='store_true',
                        help='show identifiers in diagnostic output')
    args = yakonfig.parse_args(parser, [yakonfig, dblogger])

    ## TODO: if we start needing to load FC chunk files (instead of SI
    ## chunk files), this might need to be told which kind of chunk it
    ## is loading, and we'll need a second function along the lines of
    ## ids_and_clean_visible_from_streamcorpus_chunk_path

    ## mimic the in-process interface:
    ids_and_clean_visible = \
        ids_and_clean_visible_from_streamcorpus_chunk_path(args.corpus)
    logger.info('gathered %d texts', len(ids_and_clean_visible))

    def format_result(result):
        # one tab-separated diagnostic line per candidate selector
        score, soft_selector_phrase, matching_texts = result
        return '%.6f\t%d texts say:\t%s\t%s' % \
            (score, len(matching_texts),
             soft_selector_phrase.encode('utf8'),
             args.show_ids and repr(matching_texts) or '')

    if args.scan_window_size:
        best = find_soft_selectors(
            ids_and_clean_visible,
            start_num_tokens=args.num_tokens,
            max_num_tokens=args.max_num_tokens,
            filtered_punctuation=args.filter_punctuation)
        if not best:
            print('failed to find a best result!')
        else:
            print('found a best result:')
            print('\n'.join(map(format_result, best)))
    else:
        results = find_soft_selectors_at_n(
            ids_and_clean_visible, args.num_tokens, args.filter_punctuation)
        print('\n'.join(map(format_result, results)))
def main():
    '''Build a minimal work-unit stand-in and hand it to worker().'''
    p = argparse.ArgumentParser()
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])
    config = yakonfig.get_global_config()

    # anonymous attribute bag mimicking a real work unit's interface
    class Empty(object):
        pass

    e = Empty()
    e.spec = dict(config=config)
    worker(e)
def main():
    '''Run the Actions CLI; dblogger is optional at import time.'''
    parser = argparse.ArgumentParser()
    action = Actions()
    action.add_arguments(parser)
    modules = [yakonfig]
    if dblogger:
        modules += [dblogger]
    modules += [kvlayer]
    args = yakonfig.parse_args(parser, modules)
    action.main(args)
def main():
    '''Run a rejester work-handling daemon, foreground or forked.'''
    parser = argparse.ArgumentParser(
        description='run a rejester work-handling daemon')
    args_run_worker(parser)
    args = yakonfig.parse_args(parser, [yakonfig, rejester])
    # snapshot the config, then clear the global so the worker process
    # (which may fork) sets it up itself
    gconfig = yakonfig.get_global_config()
    yakonfig.clear_global_config()
    if args.foreground:
        go(gconfig, args)
    else:
        fork_worker(gconfig, args)
def main():
    '''Run kvlayer performance tests on a single backend.

    Returns a process exit code suitable for ``sys.exit()``.
    '''
    parser = argparse.ArgumentParser(
        description='Run kvlayer performance tests on a single backend.',
        conflict_handler='resolve')
    parser.add_argument('--num-workers', action='append', default=[],
                        type=int)
    parser.add_argument('--item-size', action='append', default=[], type=int,
                        help='size of the items to push in the large writes '
                        'test, defaults to maximum size per record in thrift '
                        'RPC server example, i.e. 15MB minus a bit of '
                        'overhead')
    parser.add_argument('--num-items-per-batch', action='append', default=[],
                        type=int,
                        help='number of items per batch in the large writes '
                        'test, defaults to 1')
    parser.add_argument('--num-batches', default=10, type=int,
                        help='number of batches in the large writes test, '
                        'defaults to 10')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--shutdown-proxies', action='store_true')
    parser.add_argument('--out', default=None,
                        help='file to append results to')
    modules = [yakonfig]
    if dblogger:
        modules.append(dblogger)
    modules.append(kvlayer)
    args = yakonfig.parse_args(parser, modules)
    if args.out:
        out = open(args.out, 'a')
    else:
        out = sys.stdout
    try:
        if not args.item_size:
            args.item_size = [fifteen_MB_minus_overhead]
        if not args.num_workers:
            args.num_workers = [1]
        if not args.num_items_per_batch:
            args.num_items_per_batch = [1]
        # return code for sys.exit()
        rc = 0
        for num_workers in args.num_workers:
            for num_items_per_batch in args.num_items_per_batch:
                for item_size in args.item_size:
                    rc = run_perftests(
                        num_workers=num_workers,
                        item_size=item_size,
                        num_items_per_batch=num_items_per_batch,
                        num_batches=args.num_batches,
                        profile=args.profile,
                        out=out)
        if args.shutdown_proxies:
            # special feature of CBOR RPC proxy, really for testing only!
            client = kvlayer.client()
            client.shutdown_proxies()
        return rc
    finally:
        # fixed: the --out file was opened but never closed; never close
        # sys.stdout itself
        if out is not sys.stdout:
            out.close()
def main():
    '''Demonstrate Autoconfig: parse args, show the config, build the object.'''
    conf = Autoconfig(an_object)
    parser = argparse.ArgumentParser()
    args = yakonfig.parse_args(parser, [conf])
    config = yakonfig.get_global_config()
    # fixed: these were Python 2 print statements; the single-argument
    # print() calls produce identical output on Python 2 and 3
    print("The global configuration:")
    print(config)
    print('')
    obj = conf(config)
    print("The object:")
    print(obj)
def main():
    '''Score a run file and write a copy with scores inserted.'''
    parser = argparse.ArgumentParser(__doc__, conflict_handler='resolve')
    parser.add_argument('run_file_path', help='path to run file to score.')
    parser.add_argument('scored_run_file_output_path',
                        # fixed: concatenation previously produced
                        # 'insertedinto' (missing space)
                        help='path to file to create with scores inserted '
                        'into run file.')
    parser.add_argument('--overwrite', action='store_true', default=False,
                        help='overwrite any existing run file.')
    parser.add_argument('--verbose', action='store_true', default=False,
                        help='display verbose log messages.')
    parser.add_argument('--scorer', action='append', default=[],
                        dest='scorers',
                        help='names of scorer functions to run;'
                        ' if none are provided, it runs all of them')
    modules = [yakonfig, kvlayer]
    args = yakonfig.parse_args(parser, modules)
    if os.path.exists(args.scored_run_file_output_path):
        if args.overwrite:
            os.remove(args.scored_run_file_output_path)
        else:
            sys.exit('%r already exists' % args.scored_run_file_output_path)
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level)
    kvl = kvlayer.client()
    label_store = LabelStore(kvl)
    run = load_run(args.run_file_path)
    if len(args.scorers) == 0:
        args.scorers = available_scorers.keys()
    for scorer_name in args.scorers:
        scorer = available_scorers.get(scorer_name)
        logger.info('running %s', scorer_name)
        # this modifies the run['scores'] object itself
        scorer(run, label_store)
    print(format_scores(run))
    # fixed: file handle was opened without ever being closed; a context
    # manager guarantees flush+close
    with open(args.scored_run_file_output_path, 'wb') as f:
        f.write(json.dumps(run, indent=4))
def main():
    '''Main command-line entry point.

    Runs a coordinate work-handling daemon, foreground or forked.
    '''
    parser = argparse.ArgumentParser(
        description='run a coordinate work-handling daemon')
    args_run_worker(parser)
    args = yakonfig.parse_args(parser, [yakonfig, coordinate])
    # snapshot the config, then clear the global so the (possibly forked)
    # worker re-establishes it itself
    gconfig = yakonfig.get_global_config()
    yakonfig.clear_global_config()
    if args.foreground:
        go(gconfig, args)
    else:
        fork_worker(gconfig, args)
def main():
    '''Smoke-test loading NIST TREC 2015 truth data into kvlayer.'''
    parser = argparse.ArgumentParser('test tool for checking that we can '
                                     'load the truth data as distributed by '
                                     'NIST for TREC 2015')
    parser.add_argument('truth_data_path', help='path to truth data file')
    modules = [yakonfig, kvlayer]
    args = yakonfig.parse_args(parser, modules)
    logging.basicConfig(level=logging.DEBUG)
    kvl = kvlayer.client()
    label_store = LabelStore(kvl)
    parse_truth_data(label_store, args.truth_data_path)
    logger.debug('Done! The truth data was loaded into this kvlayer '
                 'backend: %r',
                 json.dumps(yakonfig.get_global_config('kvlayer'),
                            indent=4, sort_keys=True))
def main():
    '''Process a sequence of stream items through the pipeline.'''
    import argparse
    parser = argparse.ArgumentParser(
        description='process a sequence of stream items',
        usage='streamcorpus_pipeline --config config.yaml --input file.in')
    parser.add_argument('-i', '--input', action='append',
                        help='file paths to input instead of reading '
                        'from stdin')
    parser.add_argument('--in-glob', action='append', default=[],
                        help='path glob specifying input files')
    parser.add_argument('--third-dir-path',
                        help='path to third-party tools directory')
    parser.add_argument('--tmp-dir-path',
                        help='path to temporary directory for scratch '
                        'files, can be large')
    modules = [yakonfig, kvlayer, dblogger, streamcorpus_pipeline]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()
    ## this modifies the global config, passed by reference
    instantiate_config(config)

    # collect input paths from globs and/or explicit -i flags;
    # "-i -" means read paths from stdin and excludes everything else
    input_paths = []
    if args.in_glob:
        for pattern in args.in_glob:
            input_paths.extend(glob.glob(pattern))
    if args.input:
        if '-' in args.input:
            if args.in_glob:
                sys.exit('cannot use "-i -" and --in-glob together')
            if len(args.input) > 1:
                sys.exit('cannot use "-i -" with multiple inputs')
            input_paths = sys.stdin
        else:
            input_paths.extend(args.input)

    scp_config = config['streamcorpus_pipeline']
    stages = PipelineStages()
    if 'external_stages_path' in scp_config:
        stages.load_external_stages(scp_config['external_stages_path'])
    if 'external_stages_modules' in scp_config:
        for mod in scp_config['external_stages_modules']:
            stages.load_module_stages(mod)
    factory = PipelineFactory(stages)
    pipeline = factory(scp_config)

    for i_str in input_paths:
        work_unit = SimpleWorkUnit(i_str.strip())
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

    ## explicitly call cleanup, which is idempotent
    pipeline.cleanup()
def main():
    '''Create rejester jobs that load documents into elasticsearch.'''
    parser = argparse.ArgumentParser(
        description='create rejester jobs to load elasticsearch',
        conflict_handler='resolve')
    parser.add_argument('--source', action='append',
                        help='source strings to consider')
    parser.add_argument('--work-spec-name', '-W', metavar='NAME',
                        default='elasticsearch',
                        help='name of rejester work spec')
    args = yakonfig.parse_args(parser,
                               [yakonfig, rejester, kvlayer, dblogger])
    task_master = rejester.TaskMaster(yakonfig.get_global_config('rejester'))
    kvl = kvlayer.client()
    make_rejester_jobs(task_master, kvl, args.source, args.work_spec_name)
def main():
    '''Process entire directories using streamcorpus_pipeline.'''
    parser = argparse.ArgumentParser(
        conflict_handler='resolve',
        description='process entire directories using streamcorpus_pipeline')
    parser.add_argument('directories', nargs='+', metavar='directory',
                        help='directory name(s) to process')
    args = yakonfig.parse_args(parser, [
        yakonfig, rejester, kvlayer, dblogger, streamcorpus_pipeline,
        DirectoryConfig
    ])
    gconfig = yakonfig.get_global_config()
    scdconfig = gconfig['streamcorpus_directory']

    work_spec = {
        'name': scdconfig.get('name', 'streamcorpus_directory'),
        'desc': 'read files from a directory',
        'min_gb': 8,
        'config': gconfig,
        'module': 'streamcorpus_pipeline._rejester',
        'run_function': 'rejester_run_function',
        'terminate_function': 'rejester_terminate_function',
    }

    def get_filenames():
        # interpretation of the positional args depends on the 'mode' key:
        # literal file names, files listing names, or directory trees
        for d in args.directories:
            if scdconfig['mode'] == 'files':
                yield d
            elif scdconfig['mode'] == 'file-lists':
                with open(d, 'r') as f:
                    for line in f:
                        yield line.strip()
            elif scdconfig['mode'] == 'directories':
                for dirpath, dirnames, filenames in os.walk(d):
                    for filename in filenames:
                        yield os.path.abspath(
                            os.path.join(dirpath, filename))

    work_units = {filename: {'start_count': 0}
                  for filename in get_filenames()}

    if scdconfig['engine'] == 'rejester':
        tm = rejester.TaskMaster(gconfig['rejester'])
        tm.update_bundle(work_spec, work_units)
    elif scdconfig['engine'] == 'standalone':
        for k, v in work_units.iteritems():
            u = SimpleWorkUnit(k)
            u.spec = work_spec
            u.data = v
            rejester_run_function(u)
def main():
    '''Command line interface to the office TREC DD jig.'''
    parser = argparse.ArgumentParser(
        'Command line interface to the office TREC DD jig.',
        usage=usage, conflict_handler='resolve')
    parser.add_argument('command',
                        help='must be "load", "init", "start", "step", '
                        'or "stop"')
    parser.add_argument('args', help='input for given command',
                        nargs=argparse.REMAINDER)
    modules = [yakonfig, kvlayer, Harness]
    args = yakonfig.parse_args(parser, modules)
    logging.basicConfig(level=logging.DEBUG)

    if args.command not in set(['load', 'init', 'start', 'step', 'stop']):
        sys.exit('The only valid commands are "load", "init", "start", '
                 '"step", and "stop".')

    kvl = kvlayer.client()
    label_store = LabelStore(kvl)
    config = yakonfig.get_global_config('harness')
    harness = Harness(config, kvl, label_store)

    if args.command == 'load':
        if not config.get('truth_data_path'):
            sys.exit('Must provide --truth-data-path as an argument')
        if not os.path.exists(config['truth_data_path']):
            sys.exit('%r does not exist' % config['truth_data_path'])
        parse_truth_data(label_store, config['truth_data_path'])
        logger.info('Done! The truth data was loaded into this '
                    'kvlayer backend:\n%s',
                    json.dumps(yakonfig.get_global_config('kvlayer'),
                               indent=4, sort_keys=True))
    elif args.command == 'init':
        response = harness.init()
        print(json.dumps(response))
    elif args.command == 'start':
        response = harness.start()
        print(json.dumps(response))
    elif args.command == 'stop':
        response = harness.stop(args.args[0])
        print(json.dumps(response))
    elif args.command == 'step':
        # remaining args: topic id followed by step inputs
        parts = args.args
        topic_id = parts.pop(0)
        feedback = harness.step(topic_id, parts)
        print(json.dumps(feedback))
def get_application():
    '''Build the DossierStack web app with model routes and search engines.

    Returns a ``(args, app)`` pair.
    '''
    config = Config()
    p = argparse.ArgumentParser(description='Run DossierStack web services.')
    web.add_cli_arguments(p)
    args = yakonfig.parse_args(p, [dblogger, config, kvlayer, yakonfig])
    bottle.debug(True)
    builder = web.WebBuilder()
    builder = builder.set_config(config).enable_cors()
    builder = builder.inject('tfidf', lambda: config.tfidf)
    builder = builder.inject('google', lambda: config.google)
    builder = builder.add_routes(models_app)
    builder = builder.add_filter('already_labeled', same_subfolder)
    builder = builder.add_search_engine('similar', similar)
    builder = builder.add_search_engine('dissimilar', dissimilar)
    app = builder.get_app()
    return args, app
def get_application():
    '''Build the Memex Dossier Stack web app with akagraph injections.

    Returns a ``(args, app)`` pair.
    '''
    config = Config()
    p = argparse.ArgumentParser(
        description='Run Memex Dossier Stack web services.')
    web.add_cli_arguments(p)
    args = yakonfig.parse_args(p, [dblogger, config, kvlayer, yakonfig])
    bottle.debug(True)
    builder = web.WebBuilder().set_config(config).enable_cors()
    builder = builder.inject('tfidf', lambda: config.tfidf)
    builder = builder.inject('google', lambda: config.google)
    builder = builder.inject('akagraph', lambda: config.akagraph)
    builder = builder.inject('akagraph_replicas',
                             lambda: config.akagraph_replicas)
    app = builder.add_routes(models_app).get_app()
    return args, app
def main():
    '''Process entire directories using streamcorpus_pipeline.'''
    parser = argparse.ArgumentParser(
        conflict_handler='resolve',
        description='process entire directories using streamcorpus_pipeline')
    parser.add_argument('directories', nargs='+', metavar='directory',
                        help='directory name(s) to process')
    args = yakonfig.parse_args(parser,
                               [yakonfig, rejester, kvlayer, dblogger,
                                streamcorpus_pipeline, DirectoryConfig])
    gconfig = yakonfig.get_global_config()
    scdconfig = gconfig['streamcorpus_directory']

    work_spec = {
        'name': scdconfig.get('name', 'streamcorpus_directory'),
        'desc': 'read files from a directory',
        'min_gb': 8,
        'config': gconfig,
        'module': 'streamcorpus_pipeline._rejester',
        'run_function': 'rejester_run_function',
        'terminate_function': 'rejester_terminate_function',
    }

    def get_filenames():
        # 'mode' selects how the positional arguments are interpreted:
        # plain file names, files containing lists of names, or trees
        for d in args.directories:
            if scdconfig['mode'] == 'files':
                yield d
            elif scdconfig['mode'] == 'file-lists':
                with open(d, 'r') as f:
                    for line in f:
                        yield line.strip()
            elif scdconfig['mode'] == 'directories':
                for dirpath, dirnames, filenames in os.walk(d):
                    for filename in filenames:
                        yield os.path.abspath(
                            os.path.join(dirpath, filename))

    work_units = {filename: {'start_count': 0}
                  for filename in get_filenames()}

    if scdconfig['engine'] == 'rejester':
        tm = rejester.TaskMaster(gconfig['rejester'])
        tm.update_bundle(work_spec, work_units)
    elif scdconfig['engine'] == 'standalone':
        for k, v in work_units.iteritems():
            u = SimpleWorkUnit(k)
            u.spec = work_spec
            u.data = v
            rejester_run_function(u)
def get_application():
    '''Build the DossierStack web app (fluent-builder variant).

    Returns a ``(args, app)`` pair.
    '''
    config = Config()
    p = argparse.ArgumentParser(description='Run DossierStack web services.')
    web.add_cli_arguments(p)
    args = yakonfig.parse_args(p, [dblogger, config, kvlayer, yakonfig])
    bottle.debug(True)
    app = (web.WebBuilder()
           .set_config(config)
           .enable_cors()
           .inject('tfidf', lambda: config.tfidf)
           .inject('google', lambda: config.google)
           .add_routes(models_app)
           .add_filter('already_labeled', same_subfolder)
           .add_search_engine('similar', similar)
           .add_search_engine('dissimilar', dissimilar)
           .get_app())
    return args, app
def main():
    '''Point kvlayer at a redis backend and run scan_batch_size().'''
    import argparse
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--storage_type', default='redis')
    # NOTE(review): nargs='?' stores a single string, yet the default
    # below is a list -- confirm scan_batch_size tolerates both shapes
    parser.add_argument('--storage_address', nargs='?',
                        dest='storage_addresses')
    modules = [yakonfig, kvlayer]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()
    if not args.storage_addresses:
        args.storage_addresses = ['redis.diffeo.com:6379']
    config['kvlayer'].update({
        'storage_type': args.storage_type,
        'storage_addresses': args.storage_addresses,
    })
    client = kvlayer.client()
    scan_batch_size(client)
def main():
    '''Run the random recommender system on a sequence of topics.
    '''
    description = (
        'A baseline recommender system that uses the truth data to'
        ' create output that has perfect recall and would also have'
        ' perfect precision if you ignore subtopic diversity/novelty.'
        ' This generates output directly from the truth data and'
        ' randomly shuffles the truth data per topic, so that'
        ' the ordering of passages does not attempt to optimize any'
        ' particular quality metric.')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--overwrite', action='store_true')
    args = yakonfig.parse_args(parser, [yakonfig])
    logging.basicConfig(level=logging.DEBUG)

    config = yakonfig.get_global_config('harness')
    batch_size = config.get('batch_size', 5)
    run_file_path = config['run_file_path']
    if os.path.exists(run_file_path):
        if args.overwrite:
            os.remove(run_file_path)
        else:
            sys.exit('%r already exists' % run_file_path)

    # throwaway local kvlayer store for the truth data
    kvl_config = {'storage_type': 'local',
                  'namespace': 'test',
                  'app_name': 'test'}
    kvl = kvlayer.client(kvl_config)
    label_store = LabelStore(kvl)
    parse_truth_data(label_store, config['truth_data_path'])

    # Set up the system
    doc_store = make_doc_store(label_store)
    system = RandomSystem(doc_store)
    ambassador = HarnessAmbassadorCLI(system, args.config, batch_size)
    ambassador.run()
def main():
    '''Run the random recommender system on a sequence of topics.
    '''
    description = ('A baseline recommender system that uses the truth data to'
                   ' create output that has perfect recall and would also have'
                   ' perfect precision if you ignore subtopic'
                   ' diversity/novelty.'
                   ' This generates output directly from the truth data and'
                   ' randomly shuffles the truth data per topic, so that'
                   ' the ordering of passages does not attempt to optimize any'
                   ' particular quality metric.')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--overwrite', action='store_true')
    args = yakonfig.parse_args(parser, [yakonfig])
    logging.basicConfig(level=logging.DEBUG)

    config = yakonfig.get_global_config('harness')
    batch_size = config.get('batch_size', 5)
    run_file_path = config['run_file_path']
    if os.path.exists(run_file_path):
        if args.overwrite:
            os.remove(run_file_path)
        else:
            sys.exit('%r already exists' % run_file_path)

    # throwaway local kvlayer store for the truth data
    kvl_config = {'storage_type': 'local',
                  'namespace': 'test',
                  'app_name': 'test'}
    kvl = kvlayer.client(kvl_config)
    label_store = LabelStore(kvl)
    parse_truth_data(label_store, config['truth_data_path'])

    # Set up the system
    doc_store = make_doc_store(label_store)
    system = RandomSystem(doc_store)
    ambassador = HarnessAmbassadorCLI(system, args.config, batch_size)
    ambassador.run()
def main():
    '''Compute n-gram statistics over the built-in corpora (dev tool).'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', default=2, type=int,
                        help='the n of the ngrams')
    args = yakonfig.parse_args(parser, [yakonfig, dblogger])
    n = args.n
    ## note, these are initialized to be lower case
    corpora = initialize_corpora()
    # which = ['male', 'female', 'english']
    which = ['english']
    for corpus_name in which:
        corpus = corpora[corpus_name]
        stats = compute_statistics(corpus, n)
        ## still needs to be saved somewhere
        ## also need to remember how default dict works in order to
        ## hapax legomenoma it
        # deliberate interactive breakpoint for inspecting `stats`
        import pdb
        pdb.set_trace()
def main():
    '''Ingest AKA records into ElasticSearch and query/analyze the graph.

    Mutually exclusive modes (first match wins, each exits): --parent,
    --query (optionally writing a pair CSV via --make-pairs), --delete;
    then --ingest and/or --analyze.
    '''
    from memex_dossier.models.web.config import Config
    p = argparse.ArgumentParser('Ingest AKA records into ElasticSearch.')
    p.add_argument('--k-replicas', default=1, type=int)
    p.add_argument('--buffer-size', default=100, type=int)
    p.add_argument('--delete', action='store_true', default=False)
    p.add_argument('--parent')
    p.add_argument('--query')
    p.add_argument('--make-pairs', help='path to a csv file to create')
    p.add_argument('--input-format', default=None)
    p.add_argument('--ingest', nargs='+',
                   help='record files in gzipped CBOR or an ETL format.')
    p.add_argument('--analyze', action='store_true', default=False,
                   help='output analysis of all clusters')
    p.add_argument('--limit', default=None, type=int,
                   help='number of records to process.')
    config = Config()
    args = yakonfig.parse_args(p, [dblogger, config, kvlayer, yakonfig])
    logging.basicConfig(level=logging.DEBUG)
    aka = config.akagraph
    aka.buffer_size = args.buffer_size

    if args.parent:
        data = [aka.get_parent(AKANode(unicode(args.parent), i))
                for i in aka.replica_list]
        logger.info(data)
        sys.exit()

    if args.query:
        # gather the connected component, annotating each record with
        # its confidence
        cluster = []
        ccs = aka.find_connected_component(args.query)
        for rec, confidence in ccs:
            rec['confidence'] = confidence
            cluster.append(rec)
        if not args.make_pairs:
            logger.info(json.dumps(cluster, indent=4, sort_keys=True))
        else:
            assert args.make_pairs.endswith('.csv'),\
                '--make-pairs must end in ".csv"'

            # make the four-column Memex eval format
            # these functions are used in the loop below
            def make_name(rec):
                vals = []
                for key in ['email', 'name', 'phone', 'bitcoin']:
                    if key not in rec:
                        continue
                    vals.extend(rec[key])
                return u','.join(vals)

            def domain(rec):
                parts = rec['url'].split('/')
                if len(parts) < 3:
                    logger.debug(parts)
                    return parts[0]
                return parts[2]

            def tld(rec):
                return domain(rec).split('.')[-1]

            def bad_username(un):
                # filter out strings that clearly are not usernames
                if un[0] in set(string.digits):
                    return True
                for bad_word in ['Home', 'Wish', 'Cart', 'Shopping',
                                 'Account', 'User']:
                    if bad_word in un:
                        return True
                if len(set(un) -
                       set(string.digits + string.letters)) > 0:
                    return True
                return False

            with open(args.make_pairs, 'ab') as fh:
                writer = csv.writer(fh)
                # consider all pairs
                for i in range(len(cluster)):
                    for j in range(i, len(cluster)):
                        r1 = cluster[i]
                        r2 = cluster[j]
                        if 'bogus' in r1['url']:
                            continue
                        if 'bogus' in r2['url']:
                            continue
                        c1 = r1['confidence']
                        c2 = r2['confidence']
                        if not (1 <= c1 or 1 <= c2):
                            continue
                        score = min(r1['confidence'], r2['confidence'])
                        pair_key = ','.join(
                            [args.query, domain(r1), domain(r2)])
                        tld_key = ','.join(sorted([tld(r1), tld(r2)]))
                        #### IGNORE phone and bitconin and name
                        identifier_types = ['email', 'username']
                        for k1 in identifier_types:
                            if k1 not in r1:
                                continue
                            for k2 in identifier_types:
                                if k2 not in r2:
                                    continue
                                identifier_type_key = ','.join(
                                    sorted([k1, k2]))
                                for n1 in r1[k1]:
                                    for n2 in r2[k2]:
                                        ### they only want to *see* a
                                        ### username-like string
                                        if k1 == 'email':
                                            n1 = n1.split('@')[0]
                                        if k2 == 'email':
                                            n2 = n2.split('@')[0]
                                        if bad_username(n1):
                                            continue
                                        if bad_username(n2):
                                            continue
                                        row = (
                                            r1['url'],
                                            n1.encode('utf8'),
                                            r2['url'],
                                            n2.encode('utf8'),
                                            score,
                                            identifier_type_key,
                                            pair_key,
                                            tld_key,
                                        )
                                        writer.writerow(row)
        sys.exit()

    if args.delete:
        aka.delete_index()
        sys.exit()

    if args.input_format:
        loader = get_etl_transforms(args.input_format)
    else:
        loader = None

    if args.ingest:
        logger.debug('running ingest with loader=%r: %r', loader, aka)
        run_ingest(args, loader, aka)

    if args.analyze:
        stats = aka.analyze_clusters(limit=args.limit)
        print(json.dumps(stats, indent=4, sort_keys=True))
        sys.exit()
def main():
    '''Entry point: feature-collection store utilities via the App class.'''
    p = argparse.ArgumentParser(
        description="Interact with the Dossier feature collection store.")
    app = App()
    app.add_arguments(p)
    args = yakonfig.parse_args(p, [kvlayer, yakonfig, Store])
    app.main(args)
# else: # ## phone numbers # n = list() # for idx in xrange(10): # n.append(random.randint(0,9)) # num = (n[0], n[1], n[2], n[3], n[4], n[5], n[6], n[7], n[8], n[9]) # if random.random() < 0.5: # word = '(%d%d%d) %d%d%d-%d%d%d%d' % num # else: # word = '%d%d%d-%d%d%d-%d%d%d%d' % num if not word in examples: examples.add(word) break return list(examples) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('N', help='desired corpus size', type=int) args = yakonfig.parse_args(parser, [yakonfig, dblogger]) N = args.N examples = create_corpus(N) with open('negative-training-data-%d.txt' % N, 'w') as f: for word in examples: f.write(word + '\n')
def toy_main():
    '''Toy example: parse --thing alongside yakonfig's own arguments.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('--thing')
    modules = [yakonfig]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()