def db_out(set_id, out_dir=None, answer=None, flagged_only=False, dry=False):
    """
    Export annotations from the database. Files will be exported in
    Prodigy's JSONL format.
    """
    DB = connect()
    if set_id not in DB:
        prints("Can't find '{}' in database {}.".format(set_id, DB.db_name),
               exits=1, error=True)
    examples = DB.get_dataset(set_id)
    if flagged_only:
        examples = [eg for eg in examples if eg.get('flagged')]
    if answer:
        examples = [eg for eg in examples if eg.get('answer') == answer]
    if out_dir is None:
        for eg in examples:
            print(ujson.dumps(eg, escape_forward_slashes=False))
    else:
        if not out_dir.exists():
            out_dir.mkdir()
        out_file = out_dir / '{}.jsonl'.format(set_id)
        if not dry:
            write_jsonl(out_file, examples)
        prints("Exported {} annotations for '{}' from database {}"
               .format(len(examples), set_id, DB.db_name),
               out_file.resolve())
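# Example invocations for the command above (a sketch: the dataset name and
# output directory are hypothetical, and option spellings depend on the plac
# annotations, which aren't shown here):
#
#   prodigy db-out my_dataset /tmp/exports   # writes /tmp/exports/my_dataset.jsonl
#   prodigy db-out my_dataset                # no out_dir: streams JSONL to stdout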
def stats(set_id=None, list_datasets=False, list_sessions=False,
          no_format=False):
    """
    Print Prodigy and database statistics. Specifying a dataset ID will
    show detailed stats for the set.
    """
    DB = connect()
    prodigy_stats = {
        'version': about.__version__,
        'location': str(Path(__file__).parent),
        'prodigy_home': PRODIGY_HOME,
        'platform': platform.platform(),
        'python_version': platform.python_version(),
        'database_name': DB.db_name,
        'database_id': DB.db_id,
        'total_datasets': len(DB.datasets),
        'total_sessions': len(DB.sessions)
    }
    print_stats('Prodigy stats', prodigy_stats, no_format=no_format)
    if (list_datasets or list_sessions) and len(DB.datasets):
        print_stats('Datasets', DB.datasets, no_format, False)
    if list_sessions and len(DB.sessions):
        print_stats('Sessions', DB.sessions, no_format, False)
    if set_id:
        if set_id not in DB:
            prints("Can't find '{}' in database {}.".format(set_id, DB.db_name),
                   exits=1, error=True)
        examples = DB.get_dataset(set_id)
        meta = DB.get_meta(set_id)
        decisions = {'accept': 0, 'reject': 0, 'ignore': 0}
        for eg in examples:
            if 'answer' in eg:
                decisions[eg['answer']] += 1
            elif 'spans' in eg:
                for span in eg['spans']:
                    if 'answer' in span:
                        decisions[span['answer']] += 1
        dataset_stats = {
            'dataset': set_id,
            'created': meta.get('created'),
            'description': meta.get('description'),
            'author': meta.get('author'),
            'annotations': len(examples),
            'accept': decisions['accept'],
            'reject': decisions['reject'],
            'ignore': decisions['ignore']
        }
        print_stats("Dataset '{}'".format(set_id), dataset_stats,
                    no_format=no_format)
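# Example invocations for the command above (dataset name is hypothetical;
# `-ls` is the shorthand referenced in the `drop` docstring below):
#
#   prodigy stats                  # global Prodigy and database stats
#   prodigy stats my_dataset       # adds per-dataset stats, with answer counts
#   prodigy stats -ls              # also list all dataset and session IDs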
def to_patterns(dataset=None, label=None, output_file=None):
    """
    Convert a list of seed phrases to a list of match patterns that can be
    used with ner.match. If no output file is specified, each pattern is
    printed so the recipe's output can be piped forward to ner.match.

    This is pretty much an exact copy of terms.to-patterns. The pattern for
    each example is just split on whitespace, so instead of:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new balance"}]}

    which won't match anything, you'll get:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    if label is None:
        prints("--label is a required argument",
               "This is the label that will be assigned to all patterns "
               "created from terms collected in this dataset.",
               exits=1, error=True)
    DB = connect()

    def get_pattern(term, label):
        return {"label": label,
                "pattern": [{"lower": t.lower()} for t in term["text"].split()]}

    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    if dataset is None:
        log("RECIPE: Reading input terms from sys.stdin")
        terms = (srsly.json_loads(line) for line in sys.stdin)
    else:
        if dataset not in DB:
            prints("Can't find dataset '{}'".format(dataset),
                   exits=1, error=True)
        terms = DB.get_dataset(dataset)
        log("RECIPE: Reading {} input phrases from dataset {}".format(
            len(terms), dataset))
    if output_file:
        patterns = [get_pattern(term, label) for term in terms
                    if term["answer"] == "accept"]
        log("RECIPE: Generated {} patterns".format(len(patterns)))
        srsly.write_jsonl(output_file, patterns)
        prints("Exported {} patterns".format(len(patterns)), output_file)
    else:
        log("RECIPE: Outputting patterns")
        for term in terms:
            if term["answer"] == "accept":
                print(srsly.json_dumps(get_pattern(term, label)))
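# A minimal, standalone sketch of the whitespace-split behaviour documented in
# the docstring above (for illustration only; the recipe itself uses
# get_pattern). Each whitespace-separated token becomes its own token
# attribute, so multi-word seeds can actually match.
def _split_pattern_example():
    term = {"text": "New Balance"}
    pattern = [{"lower": t.lower()} for t in term["text"].split()]
    assert pattern == [{"lower": "new"}, {"lower": "balance"}]
    return {"label": "SHOE_BRAND", "pattern": pattern}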
def drop(set_id):
    """
    Remove a dataset. Can't be undone. For a list of all dataset and
    session IDs in the database, use `prodigy stats -ls`.
    """
    DB = connect()
    if set_id not in DB:
        prints("Can't find '{}' in database {}.".format(set_id, DB.db_name),
               exits=1, error=True)
    dropped = DB.drop_dataset(set_id)
    if not dropped:
        prints("Can't remove '{}' from database {}.".format(set_id, DB.db_name),
               exits=1, error=True)
    prints("Removed '{}' from database {}.".format(set_id, DB.db_name),
           exits=1)
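# Example invocation (hypothetical dataset name); as the docstring says, this
# cannot be undone:
#
#   prodigy drop my_dataset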
def dataset(set_id, description=None, author=None):
    """
    Create a new Prodigy dataset. This lets you assign meta information,
    like a description, and will add the new set to the database. In order
    to collect annotations and save the results, Prodigy expects a dataset
    ID to exist in the database.
    """
    DB = connect()
    if set_id in DB:
        prints("'{}' already exists in database {}.".format(set_id, DB.db_name),
               exits=1, error=True)
    meta = {'description': description, 'author': author}
    created = DB.add_dataset(set_id, meta)
    if not created:
        prints("Couldn't add '{}' to database {}.".format(set_id, DB.db_name),
               exits=1, error=True)
    prints("Successfully added '{}' to database {}.".format(set_id, DB.db_name))
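# Example invocation (set ID, description and author are hypothetical):
#
#   prodigy dataset my_dataset "Shoe brand annotations" --author Jo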
def db_in(set_id, in_file, loader=None, answer='accept', overwrite=False,
          dry=False):
    """
    Import annotations to the database. Supports all formats loadable
    by Prodigy.
    """
    DB = connect()
    if not in_file.exists() or not in_file.is_file():
        prints("Not a valid input file.", in_file, exits=1, error=True)
    if set_id not in DB:
        prints("Can't find '{}' in database {}.".format(set_id, DB.db_name),
               "Maybe you misspelled the name or forgot to add the dataset "
               "using the `dataset` command?", exits=1, error=True)
    loader = get_loader(loader, file_path=in_file)
    annotations = loader(in_file)
    annotations = [set_hashes(eg) for eg in annotations]
    added_answers = 0
    for task in annotations:
        if 'answer' not in task or overwrite:
            task['answer'] = answer
            added_answers += 1
    session_id = get_timestamp_session_id()
    if not dry:
        DB.add_dataset(session_id, session=True)
        DB.add_examples(annotations, datasets=[set_id, session_id])
    prints("Imported {} annotations for '{}' to database {}"
           .format(len(annotations), set_id, DB.db_name),
           "Added '{}' answer to {} annotations".format(answer, added_answers),
           "Session ID: {}".format(session_id))
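# Example invocations for the command above (file and dataset names are
# hypothetical; option spellings depend on the plac annotations, which
# aren't shown here):
#
#   prodigy db-in my_dataset ./annotations.jsonl
#   prodigy db-in my_dataset ./data.csv --answer reject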
commands = {
    'dataset': dataset,
    'drop': drop,
    'stats': stats,
    'pipe': pipe,
    'db-in': db_in,
    'db-out': db_out
}
help_args = ('--help', '-h', 'help')
if len(sys.argv) == 1 or (len(sys.argv) == 2 and sys.argv[1] in help_args):
    recipes = list_recipes()
    ner_recipes = [r for r in recipes if r.startswith('ner')]
    textcat_recipes = [r for r in recipes if r.startswith('textcat')]
    other_recipes = [r for r in recipes if not r.startswith('ner')
                     and not r.startswith('textcat')]
    prints("Available recipes:", ', '.join(ner_recipes), '\n',
           ', '.join(textcat_recipes), '\n', ', '.join(other_recipes))
    prints("Available commands:", ', '.join(commands.keys()), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = 'prodigy {}'.format(command)
args = sys.argv[1:]
if command in commands:
    plac.call(commands[command], arglist=args, eager=False)
else:
    path = None
    if '-F' in args:
        path = args.pop(args.index('-F') + 1)
        args.pop(args.index('-F'))
    recipe = get_recipe(command, path=path)
    if recipe:
        controller = recipe(*args, use_plac=True)
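# Example invocations handled by the dispatch above (dataset and file names
# are hypothetical):
#
#   prodigy stats -ls                      # built-in command -> plac.call
#   prodigy ner.match my_dataset ...       # built-in recipe -> get_recipe
#   prodigy custom-recipe -F ./recipes.py  # user recipe loaded from a file via -F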