def main():
    """
    Command-line test driver for the KGTK compact processor.

    Builds the argument parser, parses the command line, optionally echoes
    the resolved options to the error stream, and then constructs a
    KgtkCompact instance and calls its process() method.

    TODO: Support the list output file.
    """
    arg_parser: ArgumentParser = ArgumentParser()

    def add_bool_option(flag: str, dest: str, help_text: str, default: bool, **extra) -> None:
        # Every boolean switch shares the same optional-value parsing setup.
        arg_parser.add_argument(flag, dest=dest, help=help_text, type=optional_bool,
                                nargs='?', const=True, default=default, **extra)

    arg_parser.add_argument(dest="input_file_path", type=Path, nargs="?", default="-",
                            help="The KGTK file with the input data (default=%(default)s)")

    arg_parser.add_argument("--columns", dest="key_column_names", nargs='+', default=[],
                            help=("The key columns to identify records for compaction. "
                                  "(default=id for node files, (node1, label, node2, id) for edge files)."))

    arg_parser.add_argument("--keep-first", dest="keep_first_names", nargs='+', default=[],
                            help=("If compaction results in a list of values for any column on this list, keep only the first value after sorting. "
                                  "(default=none)."))

    add_bool_option("--compact-id", "compact_id",
                    ("Indicate that the ID column in KGTK edge files should be compacted. "
                     "Normally, if the ID column exists, it is not compacted, "
                     "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s)."),
                    False)

    add_bool_option("--deduplicate", "deduplicate",
                    ("Treat all columns as key columns, overriding --columns and --compact-id. "
                     "This will remove completely duplicate records without compacting any new lists. "
                     "(default=%(default)s)."),
                    False, metavar="True|False")

    add_bool_option("--presorted", "sorted_input",
                    "Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
                    False)

    add_bool_option("--verify-sort", "verify_sort",
                    "If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
                    True)

    add_bool_option("--lists-in-input", "lists_in_input",
                    "Assume that the input file may contain lists (disable when certain it does not). (default=%(default)s).",
                    True)

    add_bool_option("--report-lists", "report_lists",
                    "When True, report records with lists to the error output. (default=%(default)s).",
                    False)

    add_bool_option("--exclude-lists", "exclude_lists",
                    "When True, exclude records with lists from the output. (default=%(default)s).",
                    False)

    add_bool_option("--output-only-lists", "output_only_lists",
                    "When True, output only records containing lists. (default=%(default)s).",
                    False)

    arg_parser.add_argument("-o", "--output-file", dest="output_file_path", type=Path, default="-",
                            help="The KGTK file to write (default=%(default)s).")

    add_bool_option("--build-id", "build_id",
                    "Build id values in an id column. (default=%(default)s).",
                    False)

    # Shared option groups contributed by the supporting KGTK classes.
    KgtkIdBuilderOptions.add_arguments(arg_parser)
    KgtkReader.add_debug_arguments(arg_parser)
    KgtkReaderOptions.add_arguments(arg_parser, mode_options=True)
    KgtkValueOptions.add_arguments(arg_parser)

    args: Namespace = arg_parser.parse_args()

    # Diagnostics go to stderr unless the caller asked for stdout.
    error_file: typing.TextIO = sys.stderr
    if args.errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    def show_option(template: str, value: object) -> None:
        # Emit one resolved option, flushing so output interleaves predictably.
        print(template % str(value), file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        show_option("input: %s", args.input_file_path)
        show_option("--output-file=%s", args.output_file_path)
        show_option("--columns %s", " ".join(args.key_column_names))
        show_option("--keep-first %s", " ".join(args.keep_first_names))
        show_option("--compact-id=%s", args.compact_id)
        show_option("--deduplicate=%s", args.deduplicate)
        show_option("--presorted=%s", args.sorted_input)
        show_option("--verify-sort=%s", args.verify_sort)
        show_option("--lists-in-input=%s", args.lists_in_input)
        show_option("--report-lists=%s", args.report_lists)
        show_option("--exclude-lists=%s", args.exclude_lists)
        show_option("--output-only-lists=%s", args.output_only_lists)
        show_option("--build-id=%s", args.build_id)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    compactor: KgtkCompact = KgtkCompact(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        keep_first_names=args.keep_first_names,
        compact_id=args.compact_id,
        deduplicate=args.deduplicate,
        sorted_input=args.sorted_input,
        verify_sort=args.verify_sort,
        lists_in_input=args.lists_in_input,
        report_lists=args.report_lists,
        exclude_lists=args.exclude_lists,
        output_only_lists=args.output_only_lists,
        output_file_path=args.output_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )
    compactor.process()
def main():
    """
    Command-line test driver for the KGTK implode processor.

    Builds the argument parser, parses the command line, optionally echoes
    the resolved options to the error stream, and then constructs a
    KgtkImplode instance and calls its process() method.
    """
    arg_parser: ArgumentParser = ArgumentParser()

    def add_bool_option(flag: str, dest: str, help_text: str, default: bool) -> None:
        # Every boolean switch shares the same optional-value parsing setup.
        arg_parser.add_argument(flag, dest=dest, help=help_text, type=optional_bool,
                                nargs='?', const=True, default=default)

    arg_parser.add_argument(dest="input_file_path", type=Path, nargs="?", default="-",
                            help="The KGTK file with the input data. (default=%(default)s)")

    arg_parser.add_argument("--column", dest="column_name", default="node2",
                            help="The name of the column to explode. (default=%(default)s).")

    arg_parser.add_argument("--types", dest="type_names", nargs='*',
                            help="The KGTK data types for which fields should be imploded. (default=%(default)s).",
                            choices=KgtkFormat.DataType.choices(),
                            default=KgtkFormat.DataType.choices())

    arg_parser.add_argument("--without", dest="without_fields", nargs='*',
                            help="The KGTK fields to do without. (default=%(default)s).",
                            choices=KgtkValueFields.OPTIONAL_DEFAULT_FIELD_NAMES,
                            default=None)

    arg_parser.add_argument("-o", "--output-file", dest="output_file_path", type=Path, default="-",
                            help="The KGTK file to write (default=%(default)s).")

    arg_parser.add_argument("--prefix", dest="prefix", default="node2;kgtk:",
                            help="The prefix for exploded column names. (default=%(default)s).")

    add_bool_option("--overwrite", "overwrite_column",
                    "Indicate that it is OK to overwrite an existing imploded column. (default=%(default)s).",
                    True)

    add_bool_option("--validate", "validate",
                    "Validate imploded values. (default=%(default)s).",
                    True)

    add_bool_option("--escape-pipes", "escape_pipes",
                    "When true, pipe characters (|) need to be escaped (\\|) per KGTK file format. (default=%(default)s).",
                    False)

    add_bool_option("--quantities-include-numbers", "quantities_include_numbers",
                    "When true, numbers are acceptable quantities. (default=%(default)s).",
                    True)

    add_bool_option("--general-strings", "general_strings",
                    "When true, strings may include language qualified strings. (default=%(default)s).",
                    True)

    add_bool_option("--remove-prefixed-columns", "remove_prefixed_columns",
                    "When true, remove all columns beginning with the prefix from the output file. (default=%(default)s).",
                    False)

    add_bool_option("--ignore-unselected-types", "ignore_unselected_types",
                    "When true, input records with valid but unselected data types will be passed through to output. (default=%(default)s).",
                    True)

    add_bool_option("--retain-unselected-types", "retain_unselected_types",
                    "When true, input records with valid but unselected data types will be retain existing data on output. (default=%(default)s).",
                    True)

    add_bool_option("--build-id", "build_id",
                    "Build id values in an id column. (default=%(default)s).",
                    False)

    arg_parser.add_argument("--reject-file", dest="reject_file_path", type=Path, default=None,
                            help="The KGTK file into which to write rejected records (default=%(default)s).")

    # Shared option groups contributed by the supporting KGTK classes.
    KgtkIdBuilderOptions.add_arguments(arg_parser)
    KgtkReader.add_debug_arguments(arg_parser)
    KgtkReaderOptions.add_arguments(arg_parser, mode_options=True)
    KgtkValueOptions.add_arguments(arg_parser)

    args: Namespace = arg_parser.parse_args()

    # Diagnostics go to stderr unless the caller asked for stdout.
    error_file: typing.TextIO = sys.stderr
    if args.errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    def show_option(template: str, value: object) -> None:
        # Emit one resolved option, flushing so output interleaves predictably.
        print(template % str(value), file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # TODO: show ifempty-specific options.
        # NOTE(review): the TODO above looks carried over from another tool - confirm.
        show_option("input: %s", args.input_file_path)
        show_option("--column %s", args.column_name)
        show_option("--prefix %s", args.prefix)
        show_option("--overwrite %s", args.overwrite_column)
        show_option("--validate %s", args.validate)
        show_option("--escape-pipes %s", args.escape_pipes)
        show_option("--quantities-include-numbers %s", args.quantities_include_numbers)
        show_option("--general-strings %s", args.general_strings)
        show_option("--remove-prefixed-columns %s", args.remove_prefixed_columns)
        show_option("--ignore-unselected-types %s", args.ignore_unselected_types)
        show_option("--retain-unselected-types %s", args.retain_unselected_types)
        # List-valued and optional settings are only echoed when present.
        if args.type_names is not None:
            show_option("--types %s", " ".join(args.type_names))
        if args.without_fields is not None:
            show_option("--without %s", " ".join(args.without_fields))
        show_option("--output-file=%s", args.output_file_path)
        if args.reject_file_path is not None:
            show_option("--reject-file=%s", args.reject_file_path)
        show_option("--build-id=%s", args.build_id)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    # The processor receives a list even when --without was not supplied.
    without_fields: typing.List[str] = list()
    if args.without_fields is not None:
        without_fields = args.without_fields

    imploder: KgtkImplode = KgtkImplode(
        input_file_path=args.input_file_path,
        column_name=args.column_name,
        prefix=args.prefix,
        type_names=args.type_names,
        without_fields=without_fields,
        overwrite_column=args.overwrite_column,
        validate=args.validate,
        escape_pipes=args.escape_pipes,
        quantities_include_numbers=args.quantities_include_numbers,
        general_strings=args.general_strings,
        remove_prefixed_columns=args.remove_prefixed_columns,
        ignore_unselected_types=args.ignore_unselected_types,
        retain_unselected_types=args.retain_unselected_types,
        output_file_path=args.output_file_path,
        reject_file_path=args.reject_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )
    imploder.process()
def main():
    """
    Command-line test driver for the KGTK ntriples importer.

    Builds the argument parser, parses the command line, optionally echoes
    the resolved options to the error stream, and then constructs a
    KgtkNtriples instance and calls its process() method.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument("-i", "--input-files", dest="input_file_paths", nargs='*',
                        help="The file(s) with the input ntriples data. (default=%(default)s)",
                        type=Path, default="-")

    parser.add_argument("-o", "--output-file", dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path, default="-")

    parser.add_argument("--reject-file", dest="reject_file_path",
                        help="The KGTK file into which to write rejected records. (default=%(default)s).",
                        type=Path, default=None)

    parser.add_argument("--namespace-file", dest="namespace_file_path",
                        help="The KGTK file with known namespaces. (default=%(default)s).",
                        type=Path, default=None)

    parser.add_argument("--updated-namespace-file", dest="updated_namespace_file_path",
                        help="An updated KGTK file with known namespaces. (default=%(default)s).",
                        type=Path, default=None)

    # Option groups contributed by the importer and the supporting KGTK classes.
    KgtkNtriples.add_arguments(parser)
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # When --input-files is omitted, argparse converts the string default "-"
    # through type=Path and stores a single Path instead of a list.  Normalise
    # it here so the value is always a list, matching what nargs='*' produces
    # when the option is given.
    input_file_paths: typing.List[Path]
    if isinstance(args.input_file_paths, list):
        input_file_paths = args.input_file_paths
    else:
        input_file_paths = [args.input_file_paths]

    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # The original referenced an undefined bare name here, which raised
        # NameError whenever --show-options was used.
        print("--input-files %s" % " ".join([str(path) for path in input_file_paths]), file=error_file, flush=True)
        print("--output-file=%s" % str(args.output_file_path), file=error_file, flush=True)
        # TODO: show ifempty-specific options.
        if args.reject_file_path is not None:
            print("--reject-file=%s" % str(args.reject_file_path), file=error_file, flush=True)
        if args.namespace_file_path is not None:
            print("--namespace-file=%s" % str(args.namespace_file_path), file=error_file, flush=True)
        if args.updated_namespace_file_path is not None:
            print("--updated-namespace-file=%s" % str(args.updated_namespace_file_path), file=error_file, flush=True)

        print("--namespace-id-prefix %s" % args.namespace_id_prefix, file=error_file, flush=True)
        print("--namespace-id-use-uuid %s" % str(args.namespace_id_use_uuid), file=error_file, flush=True)
        print("--namespace-id-counter %s" % str(args.namespace_id_counter), file=error_file, flush=True)
        print("--namespace-id-zfill %s" % str(args.namespace_id_zfill), file=error_file, flush=True)
        print("--output-only-used-namespaces %s" % str(args.output_only_used_namespaces), file=error_file, flush=True)

        print("--allow-lax-uri %s" % str(args.allow_lax_uri), file=error_file, flush=True)

        print("--local-namespace-prefix %s" % args.local_namespace_prefix, file=error_file, flush=True)
        print("--local-namespace-use-uuid %s" % str(args.local_namespace_use_uuid), file=error_file, flush=True)

        print("--prefix-expansion-label %s" % args.prefix_expansion_label, file=error_file, flush=True)
        print("--structured-value-label %s" % args.structured_value_label, file=error_file, flush=True)
        print("--structured-uri-label %s" % args.structured_uri_label, file=error_file, flush=True)

        print("--newnode-prefix %s" % args.newnode_prefix, file=error_file, flush=True)
        print("--newnode-use-uuid %s" % str(args.newnode_use_uuid), file=error_file, flush=True)
        print("--newnode-counter %s" % str(args.newnode_counter), file=error_file, flush=True)
        print("--newnode-zfill %s" % str(args.newnode_zfill), file=error_file, flush=True)

        print("--build-id=%s" % str(args.build_id), file=error_file, flush=True)

        print("--escape-pipes=%s" % str(args.escape_pipes), file=error_file, flush=True)
        print("--validate=%s" % str(args.validate), file=error_file, flush=True)
        if args.override_uuid is not None:
            print("--override_uuid=%s" % str(args.override_uuid), file=error_file, flush=True)

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kn: KgtkNtriples = KgtkNtriples(
        input_file_paths=input_file_paths,
        output_file_path=args.output_file_path,
        reject_file_path=args.reject_file_path,
        namespace_file_path=args.namespace_file_path,
        updated_namespace_file_path=args.updated_namespace_file_path,
        namespace_id_prefix=args.namespace_id_prefix,
        namespace_id_use_uuid=args.namespace_id_use_uuid,
        namespace_id_counter=args.namespace_id_counter,
        namespace_id_zfill=args.namespace_id_zfill,
        output_only_used_namespaces=args.output_only_used_namespaces,
        newnode_prefix=args.newnode_prefix,
        newnode_use_uuid=args.newnode_use_uuid,
        newnode_counter=args.newnode_counter,
        newnode_zfill=args.newnode_zfill,
        allow_lax_uri=args.allow_lax_uri,
        local_namespace_prefix=args.local_namespace_prefix,
        local_namespace_use_uuid=args.local_namespace_use_uuid,
        prefix_expansion_label=args.prefix_expansion_label,
        structured_value_label=args.structured_value_label,
        structured_uri_label=args.structured_uri_label,
        build_id=args.build_id,
        escape_pipes=args.escape_pipes,
        idbuilder_options=idbuilder_options,
        validate=args.validate,
        override_uuid=args.override_uuid,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )
    kn.process()
def main():
    """
    Command-line test driver for the KGTK compact processor.

    Builds the argument parser, parses the command line, optionally echoes
    the resolved options to the error stream, and then constructs a
    KgtkCompact instance and calls its process() method.
    """
    arg_parser: ArgumentParser = ArgumentParser()

    arg_parser.add_argument(dest="input_file_path", type=Path, nargs="?", default="-",
                            help="The KGTK file with the input data (default=%(default)s)")

    arg_parser.add_argument("--columns", dest="key_column_names", nargs='+', default=[],
                            help=("The key columns to identify records for compaction. "
                                  "(default=id for node files, (node1, label, node2, id) for edge files)."))

    # Boolean switches registered ahead of --output-file: (flag, dest, help, default).
    leading_switches = (
        ("--compact-id", "compact_id",
         ("Indicate that the ID column in KGTK edge files should be compacted. "
          "Normally, if the ID column exists, it is not compacted, "
          "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s)."),
         False),
        ("--presorted", "sorted_input",
         "Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
         False),
        ("--verify-sort", "verify_sort",
         "If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
         True),
    )
    for flag, dest, help_text, default in leading_switches:
        arg_parser.add_argument(flag, dest=dest, help=help_text, type=optional_bool,
                                nargs='?', const=True, default=default)

    arg_parser.add_argument("-o", "--output-file", dest="output_file_path", type=Path, default="-",
                            help="The KGTK file to write (default=%(default)s).")

    arg_parser.add_argument("--build-id", dest="build_id", type=optional_bool,
                            nargs='?', const=True, default=False,
                            help="Build id values in an id column. (default=%(default)s).")

    # Shared option groups contributed by the supporting KGTK classes.
    KgtkIdBuilderOptions.add_arguments(arg_parser)
    KgtkReader.add_debug_arguments(arg_parser)
    KgtkReaderOptions.add_arguments(arg_parser, mode_options=True)
    KgtkValueOptions.add_arguments(arg_parser)

    args: Namespace = arg_parser.parse_args()

    # Diagnostics go to stderr unless the caller asked for stdout.
    error_file: typing.TextIO = sys.stderr
    if args.errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    def show_option(template: str, value: object) -> None:
        # Emit one resolved option, flushing so output interleaves predictably.
        print(template % str(value), file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        show_option("input: %s", args.input_file_path)
        show_option("--columns %s", " ".join(args.key_column_names))
        show_option("--compact-id=%s", args.compact_id)
        show_option("--presorted=%s", args.sorted_input)
        show_option("--verify-sort=%s", args.verify_sort)
        show_option("--output-file=%s", args.output_file_path)
        show_option("--build-id=%s", args.build_id)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    compactor: KgtkCompact = KgtkCompact(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        compact_id=args.compact_id,
        sorted_input=args.sorted_input,
        verify_sort=args.verify_sort,
        output_file_path=args.output_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )
    compactor.process()