def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace):
    """
    Register the command-line options for counting unique column values.

    Args:
        parser: receives the input/output file options and every option
            defined below.
        parsed_shared_args: the already-parsed shared arguments; only its
            `_expert` attribute is read.
    """
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    expert_mode: bool = parsed_shared_args._expert

    def expert_help(text: str) -> str:
        # Outside expert mode the help text is replaced by SUPPRESS.  The
        # option itself is still registered and still gets its default.
        return text if expert_mode else SUPPRESS

    parser.add_input_file(positional=True)
    parser.add_output_file()

    parser.add_argument(
        "--column",
        dest="column_name",
        required=True,
        help="The column to count unique values (required).",
    )
    parser.add_argument(
        "--empty",
        dest="empty_value",
        default="",
        help="A value to substitute for empty values (default=%(default)s).",
    )
    parser.add_argument(
        "--label",
        dest="label_value",
        default="count",
        help="The output file label column value (default=%(default)s).",
    )

    # TODO: use an enum for the output format choices.
    parser.add_argument(
        "--format",
        dest="output_format",
        choices=["edge", "node"],
        default="edge",
        help=expert_help("The output file format and mode (default=%(default)s)."),
    )
    parser.add_argument(
        "--prefix",
        dest="prefix",
        default="",
        help=expert_help("The value prefix (default=%(default)s)."),
    )

    # Optional record selection: a column name plus the list of values to test.
    parser.add_argument(
        "--where",
        dest="where_column_name",
        default=None,
        help="The name of a column for a record selection test. (default=%(default)s).",
    )
    parser.add_argument(
        "--in",
        dest="where_values",
        nargs="+",
        default=None,
        help="The list of values for a record selection test. (default=%(default)s).",
    )

    KgtkReader.add_debug_arguments(parser, expert=expert_mode)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=expert_mode)
    KgtkValueOptions.add_arguments(parser, expert=expert_mode)
def main():
    """
    Command-line test driver for KgtkImplode.

    Parses the command line, optionally echoes the parsed options to the
    error stream, then builds a KgtkImplode from them and calls process().
    """
    cli: ArgumentParser = ArgumentParser()

    # Keyword arguments shared by every optional-boolean flag: the flag may
    # be given bare (meaning True) or followed by an explicit value.
    bool_flag: typing.Dict[str, typing.Any] = dict(type=optional_bool, nargs='?', const=True)

    cli.add_argument(dest="input_file_path", type=Path, nargs="?", default="-",
                     help="The KGTK file with the input data. (default=%(default)s)")

    cli.add_argument("--column", dest="column_name", default="node2",
                     help="The name of the column to explode. (default=%(default)s).")

    cli.add_argument("--types", dest="type_names", nargs='*',
                     choices=KgtkFormat.DataType.choices(),
                     default=KgtkFormat.DataType.choices(),
                     help="The KGTK data types for which fields should be imploded. (default=%(default)s).")

    cli.add_argument("--without", dest="without_fields", nargs='*',
                     choices=KgtkValueFields.OPTIONAL_DEFAULT_FIELD_NAMES,
                     default=None,
                     help="The KGTK fields to do without. (default=%(default)s).")

    cli.add_argument("-o", "--output-file", dest="output_file_path", type=Path, default="-",
                     help="The KGTK file to write (default=%(default)s).")

    cli.add_argument("--prefix", dest="prefix", default="node2;kgtk:",
                     help="The prefix for exploded column names. (default=%(default)s).")

    cli.add_argument("--overwrite", dest="overwrite_column", default=True,
                     help="Indicate that it is OK to overwrite an existing imploded column. (default=%(default)s).",
                     **bool_flag)

    cli.add_argument("--validate", dest="validate", default=True,
                     help="Validate imploded values. (default=%(default)s).",
                     **bool_flag)

    cli.add_argument("--escape-pipes", dest="escape_pipes", default=False,
                     help="When true, pipe characters (|) need to be escaped (\\|) per KGTK file format. (default=%(default)s).",
                     **bool_flag)

    cli.add_argument("--quantities-include-numbers", dest="quantities_include_numbers", default=True,
                     help="When true, numbers are acceptable quantities. (default=%(default)s).",
                     **bool_flag)

    cli.add_argument("--general-strings", dest="general_strings", default=True,
                     help="When true, strings may include language qualified strings. (default=%(default)s).",
                     **bool_flag)

    cli.add_argument("--remove-prefixed-columns", dest="remove_prefixed_columns", default=False,
                     help="When true, remove all columns beginning with the prefix from the output file. (default=%(default)s).",
                     **bool_flag)

    cli.add_argument("--ignore-unselected-types", dest="ignore_unselected_types", default=True,
                     help="When true, input records with valid but unselected data types will be passed through to output. (default=%(default)s).",
                     **bool_flag)

    cli.add_argument("--retain-unselected-types", dest="retain_unselected_types", default=True,
                     help="When true, input records with valid but unselected data types will be retain existing data on output. (default=%(default)s).",
                     **bool_flag)

    cli.add_argument("--build-id", dest="build_id", default=False,
                     help="Build id values in an id column. (default=%(default)s).",
                     **bool_flag)

    cli.add_argument("--reject-file", dest="reject_file_path", type=Path, default=None,
                     help="The KGTK file into which to write rejected records (default=%(default)s).")

    KgtkIdBuilderOptions.add_arguments(cli)
    KgtkReader.add_debug_arguments(cli)
    KgtkReaderOptions.add_arguments(cli, mode_options=True)
    KgtkValueOptions.add_arguments(cli)

    opts: Namespace = cli.parse_args()

    error_file: typing.TextIO = sys.stderr
    if opts.errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(opts)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(opts)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(opts)

    def show(fmt: str, value: typing.Any) -> None:
        # Every option echo goes to the error stream and is flushed at once.
        print(fmt % value, file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if opts.show_options:
        show("input: %s", str(opts.input_file_path))
        show("--column %s", opts.column_name)
        show("--prefix %s", opts.prefix)
        show("--overwrite %s", str(opts.overwrite_column))
        show("--validate %s", str(opts.validate))
        show("--escape-pipes %s", str(opts.escape_pipes))
        show("--quantities-include-numbers %s", str(opts.quantities_include_numbers))
        show("--general-strings %s", str(opts.general_strings))
        show("--remove-prefixed-columns %s", str(opts.remove_prefixed_columns))
        show("--ignore-unselected-types %s", str(opts.ignore_unselected_types))
        show("--retain-unselected-types %s", str(opts.retain_unselected_types))
        if opts.type_names is not None:
            show("--types %s", " ".join(opts.type_names))
        if opts.without_fields is not None:
            show("--without %s", " ".join(opts.without_fields))
        show("--output-file=%s", str(opts.output_file_path))
        if opts.reject_file_path is not None:
            show("--reject-file=%s", str(opts.reject_file_path))
        show("--build-id=%s", str(opts.build_id))
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    # KgtkImplode is always handed a list, even when --without was not given.
    without_fields: typing.List[str]
    if opts.without_fields is None:
        without_fields = []
    else:
        without_fields = opts.without_fields

    imploder: KgtkImplode = KgtkImplode(
        input_file_path=opts.input_file_path,
        column_name=opts.column_name,
        prefix=opts.prefix,
        type_names=opts.type_names,
        without_fields=without_fields,
        overwrite_column=opts.overwrite_column,
        validate=opts.validate,
        escape_pipes=opts.escape_pipes,
        quantities_include_numbers=opts.quantities_include_numbers,
        general_strings=opts.general_strings,
        remove_prefixed_columns=opts.remove_prefixed_columns,
        ignore_unselected_types=opts.ignore_unselected_types,
        retain_unselected_types=opts.retain_unselected_types,
        output_file_path=opts.output_file_path,
        reject_file_path=opts.reject_file_path,
        build_id=opts.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=opts.verbose,
        very_verbose=opts.very_verbose,
    )
    imploder.process()
def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace):
    """
    Register this command's command-line options on the parser.

    Adds the main input file, an optional list of entity label files, the
    output file, the property-list options, the sentence label, and three
    optional-boolean switches, followed by the shared reader and value
    options.

    Args:
        parser: receives the file options and every option defined below.
        parsed_shared_args: the already-parsed shared arguments; only its
            `_expert` attribute is read.
    """
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    _expert: bool = parsed_shared_args._expert

    parser.add_input_file()
    parser.add_input_file(who="The entity label file(s)",
                          dest="entity_label_files",
                          options=['--entity-label-file'],
                          metavar="ENTITY_LABEL_FILE",
                          optional=True,
                          allow_list=True,
                          default_stdin=False)
    parser.add_output_file()

    # These list options declare no argparse default, so the default shown in
    # the help text is formatted in by hand from the module-level constants.
    parser.add_argument("--label-properties", dest="label_properties", nargs="*",
                        help="The label properties. (default=%s)" % repr(DEFAULT_LABEL_PROPERTIES))

    parser.add_argument("--description-properties", dest="description_properties", nargs="*",
                        help="The description properties. (default=%s)" % repr(DEFAULT_DESCRIPTION_PROPERTIES))

    parser.add_argument("--isa-properties", dest="isa_properties", nargs="*",
                        help="The isa properties. (default=%s)" % repr(DEFAULT_ISA_PROPERTIES))

    parser.add_argument("--has-properties", dest="has_properties", nargs="*",
                        help="The has properties. (default=%s)" % repr(DEFAULT_HAS_PROPERTIES))

    parser.add_argument("--property-values", dest="property_values", nargs="*",
                        help="The property values. (default=%s)" % repr(DEFAULT_PROPERTY_VALUES))

    parser.add_argument('--sentence-label', action='store', type=str, dest='sentence_label',
                        default=DEFAULT_SENTENCE_LABEL,
                        help="The relationship to write in the output file. (default=%(default)s)")

    parser.add_argument("--explain", dest="explain", metavar="True|False",
                        help="When true, include an explanation column that tells how the sentence was constructed. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--presorted", dest="presorted", metavar="True|False",
                        help="When true, the input file is presorted on node1. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--add-entity-labels-from-input", dest="add_entity_labels_from_input", metavar="True|False",
                        help="When true, extract entity labels from the unsorted input file. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    # Pass the shared expert flag through instead of a hard-coded False, so
    # these helpers receive the same setting here as in the other commands.
    # NOTE(review): presumably `expert` only affects which options appear in
    # the help output -- confirm against the helper implementations.
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
def main():
    """
    Command-line test driver for KgtkIfExists.

    Parses the command line, optionally echoes the parsed options to the
    error stream, then builds a KgtkIfExists from them and calls process().
    """
    cli: ArgumentParser = ArgumentParser()

    # Keyword arguments shared by the optional-boolean flags; all of them
    # default to False in this driver.
    off_by_default: typing.Dict[str, typing.Any] = dict(type=optional_bool, nargs='?', const=True, default=False)

    cli.add_argument(dest="input_file_path", type=Path, nargs="?",
                     help="The KGTK file with the input data")

    cli.add_argument("--filter-on", dest="filter_file_path", type=Path, required=True,
                     help="The KGTK file with the filter data (required).")

    cli.add_argument("-o", "--output-file", dest="output_file_path", type=Path, default="-",
                     help="The KGTK file to write (default=%(default)s).")

    cli.add_argument("--field-separator", dest="field_separator",
                     default=KgtkIfExists.FIELD_SEPARATOR_DEFAULT,
                     help="Separator for multifield keys (default=%(default)s)")

    cli.add_argument("--invert", dest="invert",
                     help="Invert the test (if not exists) (default=%(default)s).",
                     **off_by_default)

    cli.add_argument("--cache-input", dest="cache_input",
                     help="Cache the input file instead of the filter keys. (default=%(default)s).",
                     **off_by_default)

    cli.add_argument("--preserve-order", dest="preserve_order",
                     help="Preserve record order when cacheing the input file. (default=%(default)s).",
                     **off_by_default)

    cli.add_argument("--input-keys", dest="input_keys", nargs='*',
                     help="The key columns in the input file (default=None).")

    cli.add_argument("--filter-keys", dest="filter_keys", nargs='*',
                     help="The key columns in the filter file (default=None).")

    # The reader options are registered twice, once per file role.
    KgtkReader.add_debug_arguments(cli)
    KgtkReaderOptions.add_arguments(cli, mode_options=True, who="input")
    KgtkReaderOptions.add_arguments(cli, mode_options=True, who="filter")
    KgtkValueOptions.add_arguments(cli)

    opts: Namespace = cli.parse_args()

    error_file: typing.TextIO = sys.stderr
    if opts.errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(opts, who="input")
    filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(opts, who="filter")
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(opts)

    def emit(text: str) -> None:
        print(text, file=error_file)

    # Show the final option structures for debugging and documentation.
    if opts.show_options:
        # A missing positional input path is displayed as "-".
        shown_input: str = "-" if opts.input_file_path is None else str(opts.input_file_path)
        emit("input: %s" % shown_input)
        emit("--filter-on=%s" % str(opts.filter_file_path))
        emit("--output-file=%s" % str(opts.output_file_path))
        emit("--field-separator=%s" % repr(opts.field_separator))
        emit("--invert=%s" % str(opts.invert))
        emit("--cache-input=%s" % str(opts.cache_input))
        emit("--preserve-order=%s" % str(opts.preserve_order))
        if opts.input_keys is not None:
            emit("--input-keys %s" % " ".join(opts.input_keys))
        if opts.filter_keys is not None:
            emit("--filter-keys %s" % " ".join(opts.filter_keys))
        input_reader_options.show(out=error_file, who="input")
        filter_reader_options.show(out=error_file, who="filter")
        value_options.show(out=error_file)

    checker: KgtkIfExists = KgtkIfExists(
        input_file_path=opts.input_file_path,
        input_keys=opts.input_keys,
        filter_file_path=opts.filter_file_path,
        filter_keys=opts.filter_keys,
        output_file_path=opts.output_file_path,
        field_separator=opts.field_separator,
        invert=opts.invert,
        cache_input=opts.cache_input,
        preserve_order=opts.preserve_order,
        input_reader_options=input_reader_options,
        filter_reader_options=filter_reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=opts.verbose,
        very_verbose=opts.very_verbose,
    )
    checker.process()
def main():
    """
    Command-line test driver for KgtkCat.

    Parses the command line, optionally echoes the parsed options to the
    error stream, then builds a KgtkCat from them and calls process().
    """
    cli = ArgumentParser()

    # The three column-renaming options each take one or more strings.
    name_list = dict(type=str, nargs='+')

    cli.add_argument(dest="input_file_paths", type=Path, nargs='+',
                     help="The KGTK files to concatenate")

    cli.add_argument("-o", "--output-file", dest="output_file_path", type=Path, default="-",
                     help="The KGTK file to write (default=%(default)s)")

    cli.add_argument("--output-format", dest="output_format", type=str,
                     choices=KgtkWriter.OUTPUT_FORMAT_CHOICES,
                     help="The file format (default=kgtk)")

    cli.add_argument("--output-columns", dest="output_column_names",
                     help="Rename all output columns. (default=%(default)s)",
                     **name_list)

    cli.add_argument("--old-columns", dest="old_column_names",
                     help="Rename seleted output columns: old names. (default=%(default)s)",
                     **name_list)

    cli.add_argument("--new-columns", dest="new_column_names",
                     help="Rename seleted output columns: new names. (default=%(default)s)",
                     **name_list)

    KgtkReader.add_debug_arguments(cli, expert=True)
    KgtkReaderOptions.add_arguments(cli, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(cli, expert=True)

    opts = cli.parse_args()

    error_file: typing.TextIO = sys.stderr
    if opts.errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(opts)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(opts)

    def emit(text: str) -> None:
        print(text, file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if opts.show_options:
        emit("input: %s" % " ".join([str(path) for path in opts.input_file_paths]))
        emit("--output-file=%s" % opts.output_file_path)
        if opts.output_format is not None:
            emit("--output-format=%s" % opts.output_format)
        # The column-name lists are echoed only when they were supplied.
        for fmt, names in (
            ("--output-columns=%s", opts.output_column_names),
            ("--old-columns=%s", opts.old_column_names),
            ("--new-columns=%s", opts.new_column_names),
        ):
            if names is not None:
                emit(fmt % " ".join(names))
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    concatenator: KgtkCat = KgtkCat(
        input_file_paths=opts.input_file_paths,
        output_path=opts.output_file_path,
        output_format=opts.output_format,
        output_column_names=opts.output_column_names,
        old_column_names=opts.old_column_names,
        new_column_names=opts.new_column_names,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=opts.verbose,
        very_verbose=opts.very_verbose,
    )
    concatenator.process()
def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace):
    """
    Register the command-line options for exploding a column into fields.

    Args:
        parser: receives the input/output file options and every option
            defined below.
        parsed_shared_args: the already-parsed shared arguments; only its
            `_expert` attribute is read.
    """
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalue import KgtkValueFields
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    expert_mode: bool = parsed_shared_args._expert

    def expert_help(text: str) -> str:
        # Outside expert mode the help text is replaced by SUPPRESS.  The
        # option itself is still registered and still gets its default.
        return text if expert_mode else SUPPRESS

    # Keyword arguments shared by every optional-boolean switch below; all
    # of them default to False.
    switch = dict(metavar="True|False", type=optional_bool, nargs='?', const=True, default=False)

    parser.add_input_file(positional=True)
    parser.add_output_file()

    parser.add_argument(
        "--column",
        dest="column_name",
        default=KgtkFormat.NODE2,
        help="The name of the column to explode. (default=%(default)s).",
    )

    # --types and --fields cannot be given together.
    selection: _MutuallyExclusiveGroup = parser.add_mutually_exclusive_group()
    selection.add_argument(
        "--types",
        dest="type_names",
        nargs='*',
        choices=KgtkFormat.DataType.choices(),
        default=KgtkFormat.DataType.choices(),
        help="The KGTK data types for which fields should be exploded. (default=%(default)s).",
    )
    selection.add_argument(
        "--fields",
        dest="field_names",
        nargs='*',
        choices=KgtkValueFields.FIELD_NAMES,
        help=expert_help("The names of the fields to extract (overrides --types). (default=%(default)s)."),
    )

    parser.add_argument(
        "--prefix",
        dest="prefix",
        default=KgtkFormat.NODE2 + ";" + KgtkFormat.KGTK_NAMESPACE,
        help="The prefix for exploded column names. (default=%(default)s).",
    )

    parser.add_argument(
        "--overwrite",
        dest="overwrite_columns",
        help="Indicate that it is OK to overwrite existing columns. (default=%(default)s).",
        **switch)

    parser.add_argument(
        "--expand",
        dest="expand_list",
        help="When True, expand source cells that contain a lists, else fail if a source cell contains a list. (default=%(default)s).",
        **switch)

    parser.add_argument(
        "--show-data-types",
        dest="show_data_types",
        help="Print the list of data types and exit. (default=%(default)s).",
        **switch)

    parser.add_argument(
        "--show-field-names",
        dest="show_field_names",
        help="Print the list of field names and exit. (default=%(default)s).",
        **switch)

    parser.add_argument(
        "--show-field-formats",
        dest="show_field_formats",
        help="Print the list of field names and formats, then exit. (default=%(default)s).",
        **switch)

    parser.add_argument(
        "--output-format",
        dest="output_format",
        type=str,
        choices=KgtkWriter.OUTPUT_FORMAT_CHOICES,
        help="The file format (default=kgtk)",
    )

    KgtkReader.add_debug_arguments(parser, expert=expert_mode)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=expert_mode)
    KgtkValueOptions.add_arguments(parser, expert=expert_mode)
def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace):
    """
    Register the command-line options for lowering and normalizing columns.

    Args:
        parser: receives the input file, the main output file, the optional
            new-edges output file, and every option defined below.
        parsed_shared_args: the already-parsed shared arguments; only its
            `_expert` attribute is read.
    """
    expert_mode: bool = parsed_shared_args._expert

    def expert_help(text: str) -> str:
        # Outside expert mode the help text is replaced by SUPPRESS.  The
        # option itself is still registered and still gets its default.
        return text if expert_mode else SUPPRESS

    # Keyword arguments shared by the optional-boolean switches below; all
    # of them default to True.
    on_by_default = dict(type=optional_bool, nargs='?', const=True, default=True, metavar="True|False")

    parser.add_input_file(positional=True)
    parser.add_output_file()
    parser.add_output_file(
        who=("An optional output file for new edges (normalized and/or lowered). "
             "If omitted, new edges will go in the main output file."),
        dest="new_edges_file",
        options=["--new-edges-file"],
        metavar="NEW_EDGES_FILE",
        optional=True,
    )

    # One destination reachable through three option spellings.
    parser.add_argument(
        "--columns", "--columns-to-lower", "--columns-to-remove",
        dest="columns_to_lower",
        action="store",
        type=str,
        nargs='+',
        help="Columns to lower and remove as a space-separated list. (default=all columns other than key columns)",
    )

    parser.add_argument(
        "--base-columns",
        dest="base_columns",
        nargs='*',
        help=expert_help(
            "Optionally, explicitly list the base column for each column being lowered. "
            " --base-columns and --columns-to-lower must have the same number of entries."),
    )

    parser.add_argument(
        "--label-value",
        dest="label_value",
        action="store",
        type=str,
        default=KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE,
        help=expert_help("The label value to use for lowered edges when --base-columns is used. (default=%(default)s)"),
    )

    parser.add_argument(
        "--lift-separator",
        dest="lift_separator",
        default=KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        help=expert_help("The separator between the base column and the label value. (default=%(default)s)."),
    )

    parser.add_argument(
        "--lower",
        dest="lower",
        help="When True, lower columns that match a lift pattern. (default=%(default)s)",
        **on_by_default)

    parser.add_argument(
        "--normalize",
        dest="normalize",
        help="When True, normalize columns that do not match a lift pattern. (default=%(default)s)",
        **on_by_default)

    parser.add_argument(
        "--deduplicate-new-edges",
        dest="deduplicate_new_edges",
        help="When True, deduplicate new edges. Not suitable for large files. (default=%(default)s).",
        **on_by_default)

    KgtkReader.add_debug_arguments(parser, expert=expert_mode)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    default_mode=KgtkReaderMode.EDGE,
                                    expert=expert_mode)
    KgtkValueOptions.add_arguments(parser, expert=expert_mode)
def main():
    """
    Command-line test driver for KgtkIfEmpty.

    Parses the command line, optionally echoes the parsed options to the
    error stream, then builds a KgtkIfEmpty from them and calls process().
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument(dest="input_file_path", help="The KGTK file with the input data", type=Path, nargs="?")

    parser.add_argument("--columns", dest="filter_column_names", help="The columns to filter on (default=None).",
                        nargs='+', required=True)

    parser.add_argument("--count", dest="only_count",
                        help="Only count the records, do not copy them. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("-o", "--output-file", dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).", type=Path, default="-")

    parser.add_argument("--all", dest="all_are",
                        help="False: Test if any are, True: test if all are (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--not-empty", dest="notempty",
                        help="False: test if empty, True: test if not empty (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # Echo this command's own options as well as the shared reader and
        # value options, so the dump reflects everything that was parsed.
        # A missing positional input path is displayed as "-".
        print("input: %s" % (str(args.input_file_path) if args.input_file_path is not None else "-"),
              file=error_file, flush=True)
        print("--columns %s" % " ".join(args.filter_column_names), file=error_file, flush=True)
        print("--count=%s" % str(args.only_count), file=error_file, flush=True)
        print("--output-file=%s" % str(args.output_file_path), file=error_file, flush=True)
        print("--all=%s" % str(args.all_are), file=error_file, flush=True)
        print("--not-empty=%s" % str(args.notempty), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    ie: KgtkIfEmpty = KgtkIfEmpty(
        input_file_path=args.input_file_path,
        filter_column_names=args.filter_column_names,
        output_file_path=args.output_file_path,
        all_are=args.all_are,
        notempty=args.notempty,
        only_count=args.only_count,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    ie.process()
def main():
    """
    Command-line test driver for Unique.

    Parses the command line, optionally echoes the parsed options to the
    error stream, then builds a Unique from them and calls process().
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument(dest="input_file_path", help="The KGTK file with the input data", type=Path, nargs="?")

    parser.add_argument("--column", dest="column_name", help="The column to count unique values (required).",
                        required=True)

    parser.add_argument("--empty", dest="empty_value",
                        help="A value to substitute for empty values (default=%(default)s).", default="")

    parser.add_argument("-o", "--output-file", dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).", type=Path, default="-")

    parser.add_argument("--label", dest="label_value",
                        help="The output file label column value (default=%(default)s).", default="count")

    # TODO: use an enum
    parser.add_argument("--format", dest="output_format",
                        help="The output file format and mode (default=%(default)s).",
                        default=Unique.DEFAULT_FORMAT, choices=Unique.OUTPUT_FORMATS)

    parser.add_argument("--prefix", dest="prefix", help="The value prefix (default=%(default)s).", default="")

    parser.add_argument("--where", dest="where_column_name",
                        help="The name of a column for a record selection test. (default=%(default)s).",
                        default=None)

    parser.add_argument("--in", dest="where_values", nargs="+",
                        help="The list of values for a record selection test. (default=%(default)s).",
                        default=None)

    parser.add_argument("--presorted", dest="presorted", metavar="True|False",
                        help="When True, the input file is presorted. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # A missing positional input path is displayed as "-".
        print("input: %s" % (str(args.input_file_path) if args.input_file_path is not None else "-"),
              file=error_file)
        print("--column=%s" % args.column_name, file=error_file)
        print("--empty=%s" % args.empty_value, file=error_file)
        print("--output-file=%s" % str(args.output_file_path), file=error_file)
        print("--label=%s" % args.label_value, file=error_file)
        print("--format=%s" % args.output_format, file=error_file)
        print("--prefix=%s" % args.prefix, file=error_file)
        if args.where_column_name is not None:
            print("--where=%s" % args.where_column_name, file=error_file)
        if args.where_values is not None and len(args.where_values) > 0:
            print("--in=%s" % " ".join(args.where_values), file=error_file)
        # Label the presorted value with its own option name; it used to be
        # printed as a second "--prefix=" line.
        print("--presorted=%s" % repr(args.presorted), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    uniq: Unique = Unique(
        input_file_path=args.input_file_path,
        column_name=args.column_name,
        output_file_path=args.output_file_path,
        empty_value=args.empty_value,
        label_value=args.label_value,
        output_format=args.output_format,
        prefix=args.prefix,
        where_column_name=args.where_column_name,
        where_values=args.where_values,
        presorted=args.presorted,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    uniq.process()
def main():
    """
    Command-line test driver for KgtkExplode.

    Parses the command line, optionally echoes the parsed options to the
    error stream, then builds a KgtkExplode from them and calls process().
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument(dest="input_file_path",
                        help="The KGTK file with the input data. (default=%(default)s)",
                        type=Path, nargs="?", default="-")

    parser.add_argument("--column", dest="column_name",
                        help="The name of the column to explode. (default=%(default)s).", default="node2")

    # --types and --fields cannot be given together.  The group object is
    # not an ArgumentParser, so it carries no annotation here.
    fgroup = parser.add_mutually_exclusive_group()
    fgroup.add_argument("--types", dest="type_names", nargs='*',
                        help="The KGTK data types for which fields should be exploded. (default=%(default)s).",
                        choices=KgtkFormat.DataType.choices(),
                        default=KgtkFormat.DataType.choices())

    fgroup.add_argument("--fields", dest="field_names", nargs='*',
                        help="The names of the fields to extract (overrides --types). (default=%(default)s).",
                        choices=KgtkValueFields.FIELD_NAMES)

    parser.add_argument("-o", "--output-file", dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).", type=Path, default="-")

    parser.add_argument("--prefix", dest="prefix",
                        help="The prefix for exploded column names. (default=%(default)s).",
                        default="node2;kgtk:")

    parser.add_argument("--overwrite", dest="overwrite_columns",
                        help="Indicate that it is OK to overwrite existing columns. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--expand", dest="expand_list",
                        help="Expand the source column if it contains a list, else fail. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        print("input: %s" % str(args.input_file_path), file=error_file, flush=True)
        print("--column %s" % args.column_name, file=error_file, flush=True)
        print("--prefix %s" % args.prefix, file=error_file, flush=True)
        print("--overwrite %s" % str(args.overwrite_columns), file=error_file, flush=True)
        print("--expand %s" % str(args.expand_list), file=error_file, flush=True)
        if args.field_names is not None:
            print("--fields %s" % " ".join(args.field_names), file=error_file, flush=True)
        if args.type_names is not None:
            print("--types %s" % " ".join(args.type_names), file=error_file, flush=True)
        # Send this line to the error stream like every other option line;
        # it used to go to stdout, where it could mix with data output.
        print("--output-file=%s" % str(args.output_file_path), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    ex: KgtkExplode = KgtkExplode(
        input_file_path=args.input_file_path,
        column_name=args.column_name,
        prefix=args.prefix,
        field_names=args.field_names,
        type_names=args.type_names,
        overwrite_columns=args.overwrite_columns,
        expand_list=args.expand_list,
        output_file_path=args.output_file_path,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    ex.process()
def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace):
    """
    Register the command-line options for pattern-based record filtering.

    Args:
        parser: receives the input, output and reject file arguments and
            every option defined below.
        parsed_shared_args: the already-parsed shared arguments; only its
            `_expert` attribute is read.
    """
    expert_mode: bool = parsed_shared_args._expert

    # Keyword arguments shared by the two optional-boolean switches below.
    switch = dict(type=optional_bool, nargs='?', const=True, default=False)

    # '$label == "/r/DefinedAs" && $node2=="/c/en/number_zero"'
    parser.add_argument(
        "input_kgtk_file",
        nargs="?",
        type=Path,
        default="-",
        help="The KGTK file to filter. May be omitted or '-' for stdin.",
    )

    parser.add_argument(
        "-o", "--output-file",
        dest="output_kgtk_file",
        type=Path,
        default="-",
        help="The KGTK file to write records that pass the filter (default=%(default)s).",
    )

    parser.add_argument(
        "--reject-file",
        dest="reject_kgtk_file",
        type=Path,
        default=None,
        help="The KGTK file to write records that fail the filter (default=%(default)s).",
    )

    parser.add_argument(
        '-p', '--pattern',
        dest="pattern",
        action="store",
        type=str,
        required=True,
        help="Pattern to filter on, for instance, \" ; P154 ; \" ",
    )

    # The three column-name overrides differ only in flag, destination and help.
    for flag, dest_name, help_text in (
        ('--subj', 'subj_col', "Subject column, default is node1"),
        ('--pred', 'pred_col', "Predicate column, default is label"),
        ('--obj', 'obj_col', "Object column, default is node2"),
    ):
        parser.add_argument(flag, action="store", type=str, dest=dest_name, help=help_text)

    parser.add_argument(
        "--or",
        dest="or_pattern",
        help="'Or' the clauses of the pattern. (default=%(default)s).",
        **switch)

    parser.add_argument(
        "--invert",
        dest="invert",
        help="Invert the result of applying the pattern. (default=%(default)s).",
        **switch)

    KgtkReader.add_debug_arguments(parser, expert=expert_mode)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=expert_mode)
    KgtkValueOptions.add_arguments(parser, expert=expert_mode)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line options of the "filter against a second file"
    command.

    Args:
        parser (argparse.ArgumentParser): the parser that receives the options.
        parsed_shared_args (Namespace): the already-parsed shared options;
            only `_expert` is read here.
    """
    expert_mode: bool = parsed_shared_args._expert

    def expert_help(msg: str) -> str:
        # Expert-only options stay registered (so their defaults are still
        # initialized) but are hidden from the help unless expert mode is on.
        return msg if expert_mode else SUPPRESS

    def add_flag(option: str, dest: str, help_text: str) -> None:
        # A bare flag stores True; an explicit value goes through optional_bool.
        parser.add_argument(option,
                            dest=dest,
                            metavar="True|False",
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=False)

    # The file being filtered, the file to filter against, and the output.
    parser.add_input_file(positional=True)
    parser.add_input_file(who="The KGTK file to filter against.",
                          options=["--filter-on"],
                          dest="filter_file",
                          metavar="FILTER_FILE")
    parser.add_output_file()

    # Key column selection for each side of the comparison.
    parser.add_argument(
        "--input-keys", "--left-keys",
        dest="input_keys",
        nargs='*',
        help="The key columns in the file being filtered (default=None).")
    parser.add_argument(
        "--filter-keys", "--right-keys",
        dest="filter_keys",
        nargs='*',
        help="The key columns in the filter-on file (default=None).")

    add_flag("--cache-input", "cache_input",
             "Cache the input file instead of the filter keys (default=%(default)s).")
    add_flag("--preserve-order", "preserve_order",
             "Preserve record order when cacheing the input file. (default=%(default)s).")

    parser.add_argument(
        "--field-separator",
        dest="field_separator",
        default=KgtkIfExists.FIELD_SEPARATOR_DEFAULT,
        help=expert_help("Separator for multifield keys (default=%(default)s)"))

    KgtkReader.add_debug_arguments(parser, expert=expert_mode)

    # Reader options are registered once without a `who`, then once per
    # input file with defaults=False.
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    expert=expert_mode)
    for who in ("input", "filter"):
        KgtkReaderOptions.add_arguments(parser,
                                        mode_options=True,
                                        who=who,
                                        expert=expert_mode,
                                        defaults=False)

    KgtkValueOptions.add_arguments(parser, expert=expert_mode)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line options of the pattern filter command.

    Args:
        parser (argparse.ArgumentParser): the parser that receives the options.
        parsed_shared_args (Namespace): the already-parsed shared options;
            only `_expert` is read here.
    """
    # Imported locally, at registration time.
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    expert_mode: bool = parsed_shared_args._expert

    def add_flag(option: str, dest: str, help_text: str, metavar: str) -> None:
        # A bare flag stores True, an explicit value is converted by
        # optional_bool, and absence stores False.
        parser.add_argument(option,
                            dest=dest,
                            metavar=metavar,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=False)

    # Example of a filter expression:
    # '$label == "/r/DefinedAs" && $node2=="/c/en/number_zero"'

    # One input, one output for passing records, one optional reject output.
    parser.add_input_file(positional=True)
    parser.add_output_file(
        who="The KGTK output file for records that pass the filter.")
    parser.add_output_file(
        who="The KGTK reject file for records that fail the filter.",
        dest="reject_file",
        options=["--reject-file"],
        metavar="REJECT_FILE",
        optional=True)

    # The pattern itself is mandatory.
    parser.add_argument(
        '-p', '--pattern',
        action="store",
        type=str,
        dest="pattern",
        required=True,
        help='Pattern to filter on, for instance, " ; P154 ; " ')

    # Column overrides for the three parts of the pattern.
    column_overrides = (
        ('--subj', 'subj_col', "Subject column, default is node1"),
        ('--pred', 'pred_col', "Predicate column, default is label"),
        ('--obj', 'obj_col', "Object column, default is node2"),
    )
    for option, dest, help_text in column_overrides:
        parser.add_argument(option,
                            action="store",
                            type=str,
                            dest=dest,
                            help=help_text)

    add_flag("--or", "or_pattern",
             "'Or' the clauses of the pattern. (default=%(default)s).",
             "True|False")
    add_flag("--invert", "invert",
             "Invert the result of applying the pattern. (default=%(default)s).",
             "True|False")
    add_flag("--show-version", "show_version",
             "Print the version of this program. (default=%(default)s).",
             "True/False")

    KgtkReader.add_debug_arguments(parser, expert=expert_mode)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    expert=expert_mode)
    KgtkValueOptions.add_arguments(parser, expert=expert_mode)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line options shared by the compaction command and
    its deduplication alias.

    When the invoking command equals DEDUP_COMMAND, the four compaction
    options are treated as expert options (hidden from the help unless expert
    mode is on) and --deduplicate defaults to True; otherwise they are always
    shown and --deduplicate defaults to False.

    Args:
        parser (argparse.ArgumentParser): the parser that receives the options.
        parsed_shared_args (Namespace): the already-parsed shared options;
            `_command` and `_expert` are read here.
    """
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    expert_mode: bool = parsed_shared_args._expert
    dedup_mode: bool = parsed_shared_args._command == DEDUP_COMMAND

    def compaction_help(msg: str) -> str:
        # Only the dedup alias hides the compaction options, and only when
        # expert mode is off. The options are still registered either way.
        if dedup_mode and not expert_mode:
            return SUPPRESS
        return msg

    def add_flag(option: str, dest: str, help_text: str, default: bool,
                 **extra) -> None:
        # A bare flag stores True; an explicit value goes through optional_bool.
        parser.add_argument(option,
                            dest=dest,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=default,
                            **extra)

    parser.add_input_file(positional=True)
    parser.add_output_file()

    # --- Options whose visibility/default depend on the invoking command ---
    parser.add_argument(
        "--columns",
        dest="key_column_names",
        help=compaction_help(
            "The key columns to identify records for compaction. " +
            "(default=id for node files, (node1, label, node2, id) for edge files)."),
        nargs='+',
        default=[])

    add_flag(
        "--compact-id", "compact_id",
        compaction_help(
            "Indicate that the ID column in KGTK edge files should be compacted. " +
            "Normally, if the ID column exists, it is not compacted, " +
            "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s)."),
        False,
        metavar="True|False")

    add_flag(
        "--deduplicate", "deduplicate",
        compaction_help(
            "Treat all columns as key columns, overriding --columns and --compact-id. " +
            "This will remove completely duplicate records without compacting any new lists. " +
            "(default=%(default)s)."),
        dedup_mode,
        metavar="True|False")

    add_flag(
        "--lists-in-input", "lists_in_input",
        compaction_help(
            "Assume that the input file may contain lists (disable when certain it does not). (default=%(default)s)."),
        True)

    # --- Options common to both commands ---
    add_flag(
        "--presorted", "sorted_input",
        "Indicate that the input has been presorted (or at least pregrouped) (default=%(default)s).",
        False,
        metavar="True|False")

    add_flag(
        "--verify-sort", "verify_sort",
        "If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
        True,
        metavar="True|False")

    add_flag(
        "--build-id", "build_id",
        "Build id values in an id column. (default=%(default)s).",
        False,
        metavar="True|False")

    KgtkIdBuilderOptions.add_arguments(parser, expert=expert_mode)
    KgtkReader.add_debug_arguments(parser, expert=expert_mode)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    expert=expert_mode)
    KgtkValueOptions.add_arguments(parser, expert=expert_mode)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line options of the graph statistics command.

    The options fall into four groups: which statistics to compute, which
    edges to write to the primary output file, what to write to the log
    (summary) file, and the labels used for the generated edges.

    Compared with the previous revision only help texts changed: two missing
    closing parentheses after ``%(default)s`` were added and the typos
    "pank rank", "vertext", "the the" and "-out-degree" were corrected.
    Option names, destinations, types and defaults are unchanged.

    Args:
        parser (argparse.ArgumentParser): the parser that receives the options.
        parsed_shared_args (Namespace): the already-parsed shared options;
            `_expert` and `_mode` are read here.
    """
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    _expert: bool = parsed_shared_args._expert

    def add_flag(option: str, dest: str, default: bool, help_text: str) -> None:
        # All boolean switches share one shape: a bare flag stores True and an
        # explicit value is converted by optional_bool.
        parser.add_argument(option,
                            dest=dest,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=default,
                            metavar='True|False')

    def add_label(option: str, dest: str, default: str, help_text: str) -> None:
        # Options that name the label of a generated edge.
        parser.add_argument(option,
                            action='store',
                            dest=dest,
                            default=default,
                            help=help_text)

    parser.add_input_file(positional=True, optional=False)
    parser.add_output_file()

    # --- What to compute ---
    add_flag('--undirected', 'undirected', False,
             'Is the graph undirected? If false, then the graph is ' +
             ' treated as (node1)->(node2). If true, then the graph is ' +
             ' treated as (node1)<->(node2). ' +
             '\nAlso, HITS will not be computed on undirected graphs. ' +
             '\n(default=%(default)s)')
    add_flag('--compute-pagerank', 'compute_pagerank', True,
             'Whether or not to compute the PageRank property. ' +
             '\nNote: --undirected improves the pagerank calculation. ' +
             'If you want both pagerank and in/out-degrees, you should make two runs. ' +
             '\n(default=%(default)s)')
    add_flag('--compute-hits', 'compute_hits', True,
             'Whether or not to compute the HITS properties. ' +
             '\nNote: --undirected disables HITS calculation. (default=%(default)s)')

    # --- What to write to the primary output file ---
    add_flag('--output-statistics-only', 'output_statistics_only', False,
             'If this option is set, write only the statistics edges to the primary output file. ' +
             'Else, write both the statistics and the original graph. (default=%(default)s)')
    add_flag('--output-degrees', 'output_degrees', True,
             'Whether or not to write degree edges to the primary output file. (default=%(default)s)')
    add_flag('--output-pagerank', 'output_pagerank', True,
             'Whether or not to write pagerank edges to the primary output file. (default=%(default)s)')
    add_flag('--output-hits', 'output_hits', True,
             'Whether or not to write HITS edges to the primary output file. (default=%(default)s)')

    # --- What to write to the log (summary) file ---
    parser.add_argument(
        '--log-file',
        action='store',
        type=str,
        dest='log_file',
        help='Summary file for the global statistics of the graph.',
        default='./summary.txt')
    add_flag('--log-top-relations', 'log_top_relations', True,
             'Whether or not to compute top relations and output them to the log file. (default=%(default)s)')
    add_flag('--log-degrees-histogram', 'log_degrees_histogram', True,
             'Whether or not to compute degree distribution and output it to the log file. (default=%(default)s)')
    add_flag('--log-top-pageranks', 'log_top_pageranks', True,
             'Whether or not to output PageRank centrality top-n to the log file. ' +
             '\n(default=%(default)s)')
    add_flag('--log-top-hits', 'log_top_hits', True,
             'Whether or not to output the top-n HITS to the log file. (default=%(default)s)')
    parser.add_argument(
        '--log-top-n',
        action='store',
        dest='top_n',
        default=5,
        type=int,
        help='Number of top centrality nodes to write to the log file. (default=%(default)d)')

    # --- Labels of the generated edges ---
    add_label('--vertex-in-degree-property', 'vertex_in_degree',
              'vertex_in_degree',
              'Label for edge: vertex in degree property. ' +
              '\nNote: If --undirected is True, then the in-degree will be 0. ' +
              '\n(default=%(default)s)')
    add_label('--vertex-out-degree-property', 'vertex_out_degree',
              'vertex_out_degree',
              'Label for edge: vertex out degree property. ' +
              '\nNote: if --undirected is True, then the out-degree will be the sum of ' +
              'the values that would have been calculated for in-degree and out-degree ' +
              ' if --undirected were False. ' +
              '\n(default=%(default)s)')
    add_label('--page-rank-property', 'vertex_pagerank', 'vertex_pagerank',
              'Label for page rank property. (default=%(default)s)')
    add_label('--vertex-hits-authority-property', 'vertex_auth', 'vertex_auth',
              'Label for edge: vertex hits authority. (default=%(default)s)')
    add_label('--vertex-hits-hubs-property', 'vertex_hubs', 'vertex_hubs',
              'Label for edge: vertex hits hubs. (default=%(default)s)')

    KgtkReader.add_debug_arguments(parser, expert=_expert)
    # The reader's default mode is taken from the shared `_mode` option.
    KgtkReaderOptions.add_arguments(
        parser,
        mode_options=True,
        default_mode=KgtkReaderMode[parsed_shared_args._mode],
        expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line options of the column calculation command.

    Args:
        parser (argparse.ArgumentParser): the parser that receives the options.
        parsed_shared_args (Namespace): the already-parsed shared options;
            only `_expert` is read here.
    """
    # These modules are imported locally.
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    expert_mode: bool = parsed_shared_args._expert

    def expert_help(msg: str) -> str:
        # Expert-only options stay registered (so their defaults are still
        # initialized) but are hidden from the help unless expert mode is on.
        return msg if expert_mode else SUPPRESS

    parser.add_input_file()
    parser.add_output_file()

    parser.add_argument("--output-format",
                        dest="output_format",
                        type=str,
                        help=expert_help("The file format (default=kgtk)"))

    # Source columns, destination column(s), and the operation to apply.
    source_columns_help = (
        "The list of source column names, optionally containing '..' for column ranges " +
        "and '...' for column names not explicitly mentioned.")
    parser.add_argument('-c', "--columns",
                        dest="column_names",
                        nargs='*',
                        metavar="COLUMN_NAME",
                        help=source_columns_help)
    parser.add_argument(
        "--into",
        dest="into_column_names",
        required=True,
        nargs="+",
        help="The name of the column to receive the result of the calculation.")
    parser.add_argument("--do",
                        dest="operation",
                        required=True,
                        choices=OPERATIONS,
                        help="The name of the operation.")

    # Optional operands for the operation.
    parser.add_argument("--values",
                        dest="values",
                        nargs='*',
                        metavar="VALUES",
                        help="An optional list of values")
    parser.add_argument("--format",
                        dest="format_string",
                        help="The format string for the calculation.")

    KgtkReader.add_debug_arguments(parser, expert=expert_mode)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    expert=expert_mode)
    KgtkValueOptions.add_arguments(parser, expert=expert_mode)
def main():
    """
    Command line test driver for KgtkCompact.

    Parses the command line, optionally echoes the effective options, then
    builds a KgtkCompact instance and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    def add_flag(option: str, dest: str, help_text: str, default: bool) -> None:
        # A bare flag stores True; an explicit value goes through optional_bool.
        parser.add_argument(option,
                            dest=dest,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=default)

    parser.add_argument(
        dest="input_file_path",
        type=Path,
        nargs="?",
        default="-",
        help="The KGTK file with the input data (default=%(default)s)")

    parser.add_argument(
        "--columns",
        dest="key_column_names",
        nargs='+',
        default=[],
        help="The key columns to identify records for compaction. " +
        "(default=id for node files, (node1, label, node2, id) for edge files).")

    add_flag(
        "--compact-id", "compact_id",
        "Indicate that the ID column in KGTK edge files should be compacted. " +
        "Normally, if the ID column exists, it is not compacted, " +
        "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s).",
        False)
    add_flag(
        "--presorted", "sorted_input",
        "Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
        False)
    add_flag(
        "--verify-sort", "verify_sort",
        "If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
        True)

    parser.add_argument("-o", "--output-file",
                        dest="output_file_path",
                        type=Path,
                        default="-",
                        help="The KGTK file to write (default=%(default)s).")

    add_flag("--build-id", "build_id",
             "Build id values in an id column. (default=%(default)s).",
             False)

    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO
    if args.errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        option_lines = (
            "input: %s" % str(args.input_file_path),
            "--columns %s" % " ".join(args.key_column_names),
            "--compact-id=%s" % str(args.compact_id),
            "--presorted=%s" % str(args.sorted_input),
            "--verify-sort=%s" % str(args.verify_sort),
            "--output-file=%s" % str(args.output_file_path),
            "--build-id=%s" % str(args.build_id),
        )
        for option_line in option_lines:
            print(option_line, file=error_file, flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    compactor: KgtkCompact = KgtkCompact(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        compact_id=args.compact_id,
        sorted_input=args.sorted_input,
        verify_sort=args.verify_sort,
        output_file_path=args.output_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)
    compactor.process()
def main():
    """
    Command line test driver for KgtkSortBufferTest.

    Parses the command line, optionally echoes the effective options, then
    builds a KgtkSortBufferTest instance and runs it.

    Fix: the --show-options branch used to read an undefined name
    (`input_file_paths`) and raised NameError. It now reports the single
    input file that this driver actually accepts (`args.input_file_path`).
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument("-i", "--input-file",
                        dest="input_file_path",
                        help="The KGTK input file. (default=%(default)s)",
                        type=Path,
                        default="-")

    parser.add_argument("-o", "--output-file",
                        dest="output_file_path",
                        help="The KGTK output file. (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--keygen",
        dest="keygen",
        help="The KGTK key generation procedure. (default=%(default)s).",
        type=str,
        default="node1")

    # For both switches a bare flag stores True and an explicit value is
    # converted by optional_bool.
    parser.add_argument(
        "--group-sort",
        dest="group_sort",
        help="If true, use the grouped sort and buffer. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--group-iterate",
        dest="group_iterate",
        help="If true, use the grouped iteration. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        print("--input-file=%s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        print("--keygen=%s" % str(args.keygen), file=error_file, flush=True)
        print("--group-sort=%s" % str(args.group_sort),
              file=error_file,
              flush=True)
        print("--group-iterate=%s" % str(args.group_iterate),
              file=error_file,
              flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    ksbt: KgtkSortBufferTest = KgtkSortBufferTest(
        input_file_path=args.input_file_path,
        output_file_path=args.output_file_path,
        keygen=args.keygen,
        group_sort=args.group_sort,
        group_iterate=args.group_iterate,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    ksbt.process()
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line options of the text embedding command.

    Changes relative to the previous revision:
      * ``--model`` is declared with ``nargs='+'`` and therefore yields a
        list when given on the command line; its default is now a
        one-element list instead of a bare string, so the destination has
        the same type in both cases.
      * ``--use-cache`` now has ``const=True``; previously a bare
        ``--use-cache`` (no value) stored None because ``nargs='?'`` was
        used without a ``const``.
      * The ``--output-property`` help named the wrong default value.

    Args:
        parser (argparse.ArgumentParser): the parser that receives the options.
        parsed_shared_args (Namespace): the already-parsed shared options;
            `_expert` and `_mode` are read here.
    """
    from kgtk.utils.argparsehelpers import optional_bool

    _expert: bool = parsed_shared_args._expert

    parser.accept_shared_argument('_debug')

    # input file
    parser.add_input_file(positional=True)

    # model name
    all_models_names = ALL_EMBEDDING_MODELS_NAMES
    parser.add_argument('-m', '--model',
                        action='store',
                        nargs='+',
                        dest='all_models_names',
                        default=["bert-base-nli-cls-token"],
                        choices=all_models_names,
                        help="the model to used for embedding")

    parser.add_argument(
        '-f', '--input-data-format',
        action='store',
        dest='data_format',
        choices=("test_format", "kgtk_format"),
        default="kgtk_format",
        help="the input file format, could either be `test_format` or `kgtk_format`, default is `kgtk_format`",
    )
    parser.add_argument(
        '-p', '--property-labels-file',
        action='store',
        nargs='+',
        dest='property_labels_file_uri',
        help="the path to the property labels file.",
    )

    # This should probably default to "--label-properties" if not specified.
    parser.add_argument(
        '--property-labels-filter',
        action='store',
        nargs='+',
        dest='property_labels_filter',
        default=["label"],
        help='The label columns value(s) of the edges to process in the property labels file. Default is ["label"].'
    )

    # properties (only valid for kgtk format input/output data)
    parser.add_argument(
        '--label-properties',
        action='store',
        nargs='+',
        dest='label_properties',
        default=["label"],
        help='The names of the edges for label properties, Default is ["label"]. \n This argument is only valid for input in kgtk format.'
    )
    parser.add_argument(
        '--description-properties',
        action='store',
        nargs='+',
        dest='description_properties',
        default=["description"],
        help='The names of the edges for description properties, Default is ["description"].\n This argument is only valid for input in kgtk format.'
    )
    parser.add_argument(
        '--isa-properties',
        action='store',
        nargs='+',
        dest='isa_properties',
        default=["P31"],
        help='The names of the edges for `isa` properties, Default is ["P31"] (the `instance of` node in wikidata).'
    )
    # NOTE(review): the help text says the default is ["all"] while the
    # declared default is []; presumably an empty list is expanded to "all
    # properties" downstream -- confirm before changing either side.
    parser.add_argument(
        '--has-properties',
        action='store',
        nargs='+',
        dest='has_properties',
        default=[],
        help='The names of the edges for `has` properties, Default is ["all"] (will automatically append all properties found for each node).'
    )
    parser.add_argument(
        '--property-value',
        action='store',
        nargs='+',
        dest='property_values',
        default=[],
        help="For those edges found in `has` properties, the nodes specified here will display with corresponding edge(property) values. instead of edge name. "
    )
    parser.add_argument(
        '--property-value-file',
        action='store',
        dest='property_values_file',
        help="Read the properties for --property-value option from an KGTK edge file"
    )
    parser.add_argument(
        '--output-property',
        action='store',
        dest='output_properties',
        default="text_embedding",
        help="The output property name used to record the embedding. Default is `text_embedding`. \n This argument is only valid for output in kgtk format."
    )

    # output
    parser.add_argument(
        '--save-embedding-sentence',
        action='store_true',
        dest='save_embedding_sentence',
        help="if set, will also save the embedding sentences to output.")
    parser.add_argument(
        '-o', '--embedding-projector-metadata-path',
        action='store',
        dest='output_uri',
        default="",
        help="output path for the metadata file, default will be current user's home directory"
    )
    parser.add_argument(
        '--output-data-format',
        action='store',
        dest='output_data_format',
        default="kgtk_format",
        choices=("tsv_format", "kgtk_format"),
        help="output format, can either be `tsv_format` or `kgtk_format`. \nIf choose `tsv_format`, the output "
        "will be a tsv file, with each row contains only the vector representation of a node. Each "
        "dimension is separated by a tab")
    parser.add_argument(
        '--embedding-projector-metadata',
        action='store',
        nargs='+',
        dest='metadata_properties',
        default=[],
        help="list of properties used to construct a metadata file for use in the Google Embedding Projector: http://projector.tensorflow.org. \n Default: the label and description of each node."
    )

    # black list file
    parser.add_argument(
        '-b', '--black-list',
        nargs='+',
        action='store',
        dest='black_list_files',
        default=[],
        help="the black list file, contains the Q nodes which should not consider as candidates."
    )

    # dimensional reduction relate
    parser.add_argument(
        "--dimensional-reduction",
        nargs='?',
        action='store',
        default="none",
        dest="dimensional_reduction",
        choices=("pca", "tsne", "none"),
        help='whether to run dimensional reduction algorithm or not after the embedding, default is None (not '
        'run). ')
    parser.add_argument(
        "--dimension",
        type=int,
        nargs='?',
        action='store',
        default=2,
        dest="dimension_val",
        help='How many dimension should remained after reductions, only valid when set to run dimensional '
        'reduction, default value is 2 ')

    # The process count is stored as a string; no `type` is declared.
    parser.add_argument(
        "--parallel",
        nargs='?',
        action='store',
        default="1",
        dest="parallel_count",
        help="How many processes to be run in same time, default is 1.")

    # cache config
    # const=True makes a bare `--use-cache` mean True instead of None.
    parser.add_argument(
        "--use-cache",
        type=optional_bool,
        nargs='?',
        const=True,
        action='store',
        default=False,
        dest="use_cache",
        help="whether to use cache to get some embedding vectors quicker, default is False"
    )
    parser.add_argument(
        "--cache-host",
        nargs='?',
        action='store',
        default="dsbox01.isi.edu",
        dest="cache_host",
        help="cache host address, default is `dsbox01.isi.edu`")
    parser.add_argument("--cache-port",
                        nargs='?',
                        action='store',
                        default="6379",
                        dest="cache_port",
                        help="cache server port, default is `6379`")

    # query server
    # NOTE(review): the declared default is "" although the help names a
    # URL; presumably the empty string is replaced downstream -- confirm.
    parser.add_argument(
        "--query-server",
        nargs='?',
        action='store',
        default="",
        dest="query_server",
        help="sparql query endpoint used for test_format input files, default is "
        "https://query.wikidata.org/sparql ")

    # Debug and value options are registered with expert=False; only the
    # reader options follow the shared expert setting.
    KgtkReader.add_debug_arguments(parser, expert=False)
    KgtkReaderOptions.add_arguments(
        parser,
        mode_options=True,
        default_mode=KgtkReaderMode[parsed_shared_args._mode],
        expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=False)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line options of the implode command.

    Changes relative to the previous revision: the local help-suppression
    helper was removed because nothing in this function called it, and the
    ungrammatical help text of --retain-unselected-types was corrected.
    Option names, destinations, types and defaults are unchanged.

    Args:
        parser (argparse.ArgumentParser): the parser that receives the options.
        parsed_shared_args (Namespace): the already-parsed shared options;
            only `_expert` is read here.
    """
    _expert: bool = parsed_shared_args._expert

    def add_flag(option: str, dest: str, default: bool, help_text: str) -> None:
        # All boolean switches share one shape: a bare flag stores True and an
        # explicit value is converted by optional_bool.
        parser.add_argument(option,
                            dest=dest,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=default)

    # --- Input and output files ---
    parser.add_argument(
        "input_kgtk_file",
        nargs="?",
        type=Path,
        default="-",
        help="The KGTK file to filter. May be omitted or '-' for stdin (default=%(default)s)."
    )
    parser.add_argument("-o", "--output-file",
                        dest="output_kgtk_file",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")
    parser.add_argument(
        "--reject-file",
        dest="reject_kgtk_file",
        help="The KGTK file into which to write rejected records (default=%(default)s).",
        type=Path,
        default=None)

    # --- Column, prefix and type selection ---
    # NOTE(review): the next two help texts say "explode"/"exploded" although
    # this is the implode command; left untouched pending confirmation of the
    # intended wording.
    parser.add_argument(
        "--column",
        dest="column_name",
        help="The name of the column to explode. (default=%(default)s).",
        default=KgtkFormat.NODE2)
    parser.add_argument(
        "--prefix",
        dest="prefix",
        help="The prefix for exploded column names. (default=%(default)s).",
        default=KgtkFormat.NODE2 + ";" + KgtkFormat.KGTK_NAMESPACE)
    parser.add_argument(
        "--types",
        dest="type_names",
        nargs='*',
        help="The KGTK data types for which fields should be imploded. (default=%(default)s).",
        choices=KgtkFormat.DataType.choices(),
        default=KgtkFormat.DataType.choices())
    parser.add_argument(
        "--without",
        dest="without_fields",
        nargs='*',
        help="The KGTK fields to do without. (default=%(default)s).",
        choices=KgtkValueFields.OPTIONAL_DEFAULT_FIELD_NAMES,
        default=None)

    # --- Behaviour switches ---
    add_flag("--overwrite", "overwrite_column", True,
             "Indicate that it is OK to overwrite an existing imploded column. (default=%(default)s).")
    add_flag("--validate", "validate", True,
             "Validate imploded values. (default=%(default)s).")
    add_flag("--escape-pipes", "escape_pipes", False,
             "When true, pipe characters (|) need to be escaped (\\|) per KGTK file format. (default=%(default)s).")
    add_flag("--quantities-include-numbers", "quantities_include_numbers", True,
             "When true, numbers are acceptable quantities. (default=%(default)s).")
    add_flag("--general-strings", "general_strings", True,
             "When true, strings may include language qualified strings. (default=%(default)s).")
    add_flag("--remove-prefixed-columns", "remove_prefixed_columns", False,
             "When true, remove all columns beginning with the prefix from the output file. (default=%(default)s).")
    add_flag("--ignore-unselected-types", "ignore_unselected_types", True,
             "When true, input records with valid but unselected data types will be passed through to output. (default=%(default)s).")
    add_flag("--retain-unselected-types", "retain_unselected_types", True,
             "When true, input records with valid but unselected data types will retain existing data on output. (default=%(default)s).")
    add_flag("--build-id", "build_id", False,
             "Build id values in an id column. (default=%(default)s).")
    add_flag("--show-data-types", "show_data_types", False,
             "Print the list of data types and exit. (default=%(default)s).")

    # The ID builder options are registered without an `expert` argument,
    # unlike the three groups that follow.
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
def main():
    """
    Test the KGTK ID builder.

    Parses the command line, opens the input file with a KgtkReader,
    creates a KgtkIdBuilder from the reader and the ID builder options,
    opens a KgtkWriter on the builder's column names, and then lets the
    builder process the input into the output.
    """
    cli: ArgumentParser = ArgumentParser()
    cli.add_argument(
        dest="input_file_path",
        help="The KGTK file with the input data (default=%(default)s)",
        type=Path,
        nargs="?",
        default="-",
    )
    cli.add_argument(
        "-o",
        "--output-file",
        dest="output_file_path",
        help="The KGTK file to write (default=%(default)s).",
        type=Path,
        default="-",
    )

    # Option groups contributed by the ID builder, reader and value classes.
    KgtkIdBuilderOptions.add_arguments(cli)
    KgtkReader.add_debug_arguments(cli)
    KgtkReaderOptions.add_arguments(cli, mode_options=True)
    KgtkValueOptions.add_arguments(cli)

    args: Namespace = cli.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO
    if args.errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(
        args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        print("input: %s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    # Order matters here: the reader supplies parameters to the ID column
    # builder, and the builder supplies the (possibly revised) list of
    # column names that the writer is opened with.
    reader: KgtkReader = KgtkReader.open(
        args.input_file_path,
        error_file=error_file,
        options=reader_options,
        value_options=value_options,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    id_builder: KgtkIdBuilder = KgtkIdBuilder.new(reader, idbuilder_options)

    writer: KgtkWriter = KgtkWriter.open(
        id_builder.column_names,
        args.output_file_path,
        mode=reader.mode,
        require_all_columns=True,
        prohibit_extra_columns=True,
        fill_missing_columns=False,
        gzip_in_parallel=False,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    # Process the input file, building IDs.
    id_builder.process(reader, writer)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Declare the command-line arguments for filtering one KGTK file against
    another, including the experimental join options.

    Args:
        parser: the KGTK argument parser that receives the arguments.
        parsed_shared_args: the already-parsed shared arguments; only
            `_expert` is read here.
    """
    from kgtk.iff.kgtkifexists import KgtkIfExists
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    _expert: bool = parsed_shared_args._expert

    def expert_help(msg: str) -> str:
        # Expert-only options keep their definition (and defaults) but are
        # hidden from the help text unless expert mode is on.
        return msg if _expert else SUPPRESS

    # Keyword arguments shared by every True|False flag that defaults to False.
    false_flag = dict(type=optional_bool,
                      nargs='?',
                      const=True,
                      default=False)

    # ---- input and output files ----
    parser.add_input_file(positional=True)
    parser.add_input_file(who="The KGTK file to filter against.",
                          options=["--filter-on", "--filter-file"],
                          dest="filter_file",
                          metavar="FILTER_FILE")
    parser.add_output_file()
    parser.add_output_file(
        who="The KGTK reject file for records that fail the filter.",
        dest="reject_file",
        options=["--reject-file"],
        metavar="REJECT_FILE",
        optional=True)
    parser.add_output_file(
        who="The KGTK file for filter records that matched at least one "
        "input record.",
        dest="matched_filter_file",
        options=["--matched-filter-file"],
        metavar="MATCHED_FILTER_FILE",
        optional=True)
    parser.add_output_file(
        who="The KGTK file for filter records that did not match any "
        "input records.",
        dest="unmatched_filter_file",
        options=["--unmatched-filter-file"],
        metavar="UNMATCHED_FILTER_FILE",
        optional=True)
    parser.add_output_file(
        who=expert_help(
            "The KGTK file for joined output records (EXPERIMENTAL)."),
        dest="join_file",
        options=["--join-file"],
        metavar="JOIN_FILE",
        optional=True)

    # ---- key selection ----
    parser.add_argument(
        "--input-keys",
        "--left-keys",
        dest="input_keys",
        nargs='*',
        help="The key columns in the file being filtered (default=None).")
    parser.add_argument(
        "--filter-keys",
        "--right-keys",
        dest="filter_keys",
        nargs='*',
        help="The key columns in the filter-on file (default=None).")

    # ---- processing strategy ----
    parser.add_argument(
        "--cache-input",
        dest="cache_input",
        metavar="True|False",
        help="Cache the input file instead of the filter keys "
        "(default=%(default)s).",
        **false_flag)
    parser.add_argument(
        "--preserve-order",
        dest="preserve_order",
        metavar="True|False",
        help="Preserve record order when cacheing the input file. "
        "(default=%(default)s).",
        **false_flag)
    parser.add_argument(
        "--presorted",
        dest="presorted",
        metavar="True|False",
        help="When True, assume that the input and filter files are both "
        "presorted. Use a merge-style algorithm that does not require "
        "caching either file. (default=%(default)s).",
        **false_flag)
    parser.add_argument(
        "--field-separator",
        dest="field_separator",
        help=expert_help("Separator for multifield keys (default=%(default)s)"),
        default=KgtkIfExists.FIELD_SEPARATOR_DEFAULT)

    # ---- experimental join options (expert help only) ----
    parser.add_argument(
        "--left-join",
        dest="left_join",
        metavar="True|False",
        help=expert_help(
            "When True, Include all input records in the join "
            "(EXPERIMENTAL). (default=%(default)s)."),
        **false_flag)
    parser.add_argument(
        "--right-join",
        dest="right_join",
        metavar="True|False",
        help=expert_help(
            "When True, Include all filter records in the join "
            "(EXPERIMENTAL). (default=%(default)s)."),
        **false_flag)
    parser.add_argument(
        "--input-prefix",
        dest="input_prefix",
        help=expert_help(
            "Input file column name prefix for joins (EXPERIMENTAL). "
            "(default=%(default)s)"))
    parser.add_argument(
        "--filter-prefix",
        dest="filter_prefix",
        help=expert_help(
            "Filter file column name prefix for joins (EXPERIMENTAL). "
            "(default=%(default)s)"))
    parser.add_argument(
        "--join-output",
        dest="join_output",
        metavar="True|False",
        help=expert_help(
            "When True, send the join records to the main output "
            "(EXPERIMENTAL). (default=%(default)s)."),
        **false_flag)
    parser.add_argument(
        "--right-join-first",
        dest="right_first",
        metavar="True|False",
        help=expert_help(
            "When True, send the filter record to join output before the "
            "first matching input record. "
            " Otherwise, send the first matching input record, then the "
            "filter record, then othe rmatching input records. "
            "(EXPERIMENTAL). (default=%(default)s)."),
        **false_flag)

    # ---- standard debug, reader and value options ----
    # Reader options are registered once as shared defaults and once per
    # file ("input", "filter") with defaults=False.
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    expert=_expert)
    for file_role in ("input", "filter"):
        KgtkReaderOptions.add_arguments(parser,
                                        mode_options=True,
                                        who=file_role,
                                        expert=_expert,
                                        defaults=False)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Declare the command-line arguments for computing reachable nodes.

    Args:
        parser: the KGTK argument parser that receives the arguments.
        parsed_shared_args: the already-parsed shared arguments; only
            `_expert` is read here.
    """
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    _expert: bool = parsed_shared_args._expert

    parser.add_input_file(
        positional=True,
        who="The KGTK file to find connected components in.")
    parser.add_output_file()

    # ---- root nodes ----
    parser.add_argument(
        '--root',
        action='store',
        dest='root',
        type=str,
        nargs="*",
        help='Set of root nodes to use, space- or comma-separated strings. '
        '(default=None)')
    parser.add_argument(
        '--root-file',
        '--rootfile',
        action='store',
        dest='rootfile',
        help='Option to specify a file containing the set of root nodes',
        default=None)
    parser.add_argument(
        '--rootfilecolumn',
        action='store',
        type=str,
        dest='rootfilecolumn',
        help='Specify the name or number of the root file column with the '
        'root nodes. (default=node1 or its alias if edge file, id if node '
        'file)')

    # ---- edge column names ----
    # NOTE(review): the previous help text listed "label" as the default
    # object column and "node2" as the default predicate column.  In a KGTK
    # edge (node1, label, node2) the predicate is the label column and the
    # object is the node2 column, so the two defaults were swapped.
    # Confirm against the code that resolves these columns.
    parser.add_argument(
        "--subj",
        action="store",
        type=str,
        dest="subject_column_name",
        help='Name of the subject column. (default: node1 or its alias)')
    parser.add_argument(
        "--obj",
        action="store",
        type=str,
        dest="object_column_name",
        help='Name of the object column. (default: node2 or its alias)')
    parser.add_argument(
        "--pred",
        action="store",
        type=str,
        dest="predicate_column_name",
        help='Name of the predicate column. (default: label or its alias)')

    # ---- properties to follow ----
    parser.add_argument(
        "--prop",
        "--props",
        action="store",
        type=str,
        dest="props",
        nargs="*",
        help='Properties to consider while finding reachable nodes, space- '
        'or comma-separated string. (default: all properties)',
        default=None)
    parser.add_argument(
        '--props-file',
        action='store',
        dest='props_file',
        help='Option to specify a file containing the set of properties',
        default=None)
    parser.add_argument(
        '--propsfilecolumn',
        action='store',
        type=str,
        dest='propsfilecolumn',
        default=None,
        help='Specify the name or number of the props file column with the '
        'property names. (default=node1 or its alias if edge file, id if '
        'node file)')

    # ---- inverted edges ----
    parser.add_argument(
        '--inverted',
        dest="inverted",
        help="When True, and when --undirected is False, invert the source "
        "and target nodes in the graph. (default=%(default)s)",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")
    parser.add_argument(
        "--inverted-prop",
        "--inverted-props",
        action="store",
        type=str,
        dest="inverted_props",
        nargs="*",
        help='Properties to invert, space- or comma-separated string. '
        '(default: no properties)',
        default=None)
    parser.add_argument(
        '--inverted-props-file',
        action='store',
        dest='inverted_props_file',
        help='Option to specify a file containing the set of inverted '
        'properties',
        default=None)
    parser.add_argument(
        '--invertedpropsfilecolumn',
        action='store',
        type=str,
        dest='invertedpropsfilecolumn',
        default=None,
        help='Specify the name or number of the inverted props file column '
        'with the property names. (default=node1 or its alias if edge file, '
        'id if node file)')

    # ---- undirected edges ----
    parser.add_argument(
        '--undirected',
        dest="undirected",
        help="When True, specify graph as undirected. (default=%(default)s)",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")
    parser.add_argument(
        "--undirected-prop",
        "--undirected-props",
        action="store",
        type=str,
        dest="undirected_props",
        nargs="*",
        help='Properties to treat as undirected, space- or comma-separated '
        'string. (default: no properties)',
        default=None)
    parser.add_argument(
        '--undirected-props-file',
        action='store',
        dest='undirected_props_file',
        help='Option to specify a file containing the set of undirected '
        'properties',
        default=None)
    parser.add_argument(
        '--undirectedpropsfilecolumn',
        action='store',
        type=str,
        dest='undirectedpropsfilecolumn',
        default=None,
        help='Specify the name or number of the undirected props file column '
        'with the property names. (default=node1 or its alias if edge file, '
        'id if node file)')

    # ---- output and search behaviour ----
    parser.add_argument(
        '--label',
        action='store',
        type=str,
        dest='label',
        help='The label for the reachable relationship. (default: %(default)s)',
        default="reachable")
    parser.add_argument(
        '--selflink',
        dest='selflink_bool',
        help='When True, include a link from each output node to itself. '
        '(default=%(default)s)',
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")
    parser.add_argument(
        '--show-properties',
        dest='show_properties',
        help='When True, show the graph properties. (default=%(default)s)',
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")
    parser.add_argument(
        '--breadth-first',
        dest='breadth_first',
        help='When True, search the graph breadth first. When false, search '
        'depth first. (default=%(default)s)',
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")
    parser.add_argument(
        '--depth-limit',
        dest='depth_limit',
        help='An optional depth limit for breadth-first searches. '
        '(default=%(default)s)',
        type=int,
        default=None)

    # ---- standard debug, reader and value options ----
    # Reader options are registered once as shared defaults, then once per
    # auxiliary file with defaults=False.
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    expert=_expert)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    who="input",
                                    expert=_expert,
                                    defaults=False)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    who="root",
                                    expert=_expert,
                                    defaults=False)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    who="props",
                                    expert=_expert,
                                    defaults=False)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    who="undirected_props",
                                    expert=_expert,
                                    defaults=False)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    who="inverted_props",
                                    expert=_expert,
                                    defaults=False)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
def add_arguments(parser: KGTKArgumentParser):
    """
    Declare the command-line arguments for graph embedding training.

    The boolean options use the project's `optional_bool` converter
    instead of `type=bool`.  With `type=bool`, argparse calls `bool()` on
    the raw string, so any non-empty value (including "False") parsed as
    True.  Each boolean option may now also be given without a value,
    which means True.

    Args:
        parser: the KGTK argument parser that receives the arguments.
    """
    # import modules locally
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    ### IO
    parser.add_argument("-i",
                        "--input-file",
                        dest="input_file_path",
                        help="The KGTK input file. (default=%(default)s)",
                        type=Path,
                        default="-")
    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK output file. (default=%(default)s).",
                        type=Path,
                        default="-")
    parser.add_argument('-l',
                        "--log",
                        dest="log_file_path",
                        help="Setting the log path [Default: None]",
                        type=Path,
                        default=None,
                        metavar="")
    parser.add_argument(
        '-T',
        '--temporary_directory',
        dest='temporary_directory',
        help="Sepecify the directory location to store temporary file",
        type=Path,
        default=Path('/tmp/'),
        metavar='')
    parser.add_argument(
        '-ot',
        '--output_format',
        dest='output_format',
        help=
        "Outputformat for embeddings [Default: w2v] Choice: kgtk | w2v | glove",
        default='w2v',
        metavar='')
    parser.add_argument(
        '-r',
        '--retain_temporary_data',
        dest='retain_temporary_data',
        help=
        "When opearte graph, some tempory files will be generated, set True to retain these files ",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True,
        metavar='True|False')

    ### Training parameters
    parser.add_argument(
        '-d',
        "--dimension",
        dest="dimension_num",
        help="Dimension of the real space the embedding live in [Default: 100]",
        type=int,
        default=100,
        metavar="")
    parser.add_argument(
        '-s',
        "--init_scale",
        dest="init_scale",
        help=
        "Generating the initial embedding with this standard deviation [Default: 0.001]"
        +
        "If no initial embeddings are provided, they are generated by sampling each dimension"
        +
        "from a centered normal distribution having this standard deviation.",
        type=float,
        default=0.001,
        metavar="")
    parser.add_argument(
        '-c',
        '--comparator',
        dest='comparator',
        help=
        "How the embeddings of the two sides of an edge (after having already "
        +
        "undergone some processing) are compared to each other to produce a score[Default: dot],"
        + "Choice: dot|cos|l2|squared_l2",
        default='dot',
        choices=['dot', 'cos', 'l2', 'squared_l2'],
        metavar='dot|cos|l2|squared_l2')
    parser.add_argument(
        '-op',
        '--operator',
        dest='operator',
        help=
        "The transformation to apply to the embedding of one of the sides of the edge "
        +
        "(typically the right-hand one) before comparing it with the other one. It reflects which model that embedding uses. "
        + "[Default:ComplEx]",
        # Original author's note: the default is mapped to
        # complex_diagonal later (outside this function).
        default='ComplEx',
        metavar='RESCAL|DistMult|ComplEx|TransE')
    parser.add_argument(
        '-e',
        '--num_epochs',
        dest='num_epochs',
        help=
        "The number of times the training loop iterates over all the edges.[Default:100]",
        type=int,
        default=100,
        metavar='')
    parser.add_argument(
        '-b',
        '--bias',
        dest='bias',
        help=
        "Whether use the bias choice [Default: False],If enabled, withhold the first "
        +
        "dimension of the embeddings from the comparator and instead use it as a bias, adding "
        +
        "back to the score. Makes sense for logistic and softmax loss functions. ",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar='True|False')
    parser.add_argument(
        '-w',
        '--workers',
        dest='workers',
        help=
        "The number of worker processes for training. If not given, set to CPU count.",
        type=int,
        default=None,
        metavar='')
    parser.add_argument('-bs',
                        '--batch_size',
                        dest='batch_size',
                        help="The number of edges per batch.[Default:1000]",
                        type=int,
                        default=1000,
                        metavar='')
    parser.add_argument(
        '-lf',
        '--loss_fn',
        dest='loss_fn',
        help=
        "How the scores of positive edges and their corresponding negatives "
        + "are evaluated.[Default: ranking], Choice: ranking|logistic|softmax",
        # Original author's note: None is replaced by "ranking" later
        # (outside this function).
        default=None,
        choices=['ranking', 'logistic', 'softmax', None],
        metavar='ranking|logistic|softmax')
    parser.add_argument(
        '-lr',
        '--learning_rate',
        dest='learning_rate',
        help="The learning rate for the optimizer.[Default: 0.1]",
        # Original author's note: None is replaced by 0.1 later
        # (outside this function).
        type=float,
        default=None,
        metavar='')
    parser.add_argument(
        '-ef',
        '--eval_fraction',
        dest='eval_fraction',
        help=
        "The fraction of edges withheld from training and used to track evaluation "
        + "metrics during training. [Defalut:0.0 training all edges ]",
        type=float,
        default=0.0,
        metavar='')
    # The misspelled option name and dest ("relaitons") are part of the
    # existing interface and are kept as-is for compatibility.
    parser.add_argument(
        '-dr',
        '--dynamic_relaitons',
        dest='dynamic_relaitons',
        help="Whether use dynamic relations (when graphs with a " +
        "large number of relations) [Default: True]",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True,
        metavar='True|False')
    parser.add_argument(
        '-ge',
        '--global_emb',
        dest='global_emb',
        help=
        "Whether use global embedding, if enabled, add to each embedding a vector that is common "
        "to all the entities of a certain type. This vector is learned during training.[Default: False] ",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar='True|False')

    ### kgtk format
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(parser)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Declare the command-line arguments for counting unique column values.

    Args:
        parser: the KGTK argument parser that receives the arguments.
        parsed_shared_args: the already-parsed shared arguments; only
            `_expert` is read here.
    """
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.join.unique import Unique
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    _expert: bool = parsed_shared_args._expert

    def h(msg: str) -> str:
        # Hide a help message unless expert mode is on; the option itself
        # stays defined.  No argument below currently uses this helper.
        return msg if _expert else SUPPRESS

    parser.add_input_file(positional=True)
    parser.add_output_file()

    # ---- what to count and how to label it ----
    parser.add_argument(
        "--column",
        dest="column_name",
        help="The column to count unique values (default=node2 or its alias).")
    parser.add_argument(
        "--empty",
        dest="empty_value",
        default="",
        help="A value to substitute for empty values (default=%(default)s).")
    parser.add_argument(
        "--label",
        dest="label_value",
        default="count",
        help="The output file label column value (default=%(default)s).")

    # TODO: use an enum
    parser.add_argument(
        "--format",
        dest="output_format",
        default=Unique.DEFAULT_FORMAT,
        choices=Unique.OUTPUT_FORMATS,
        help="The output file format and mode (default=%(default)s).")
    parser.add_argument(
        "--prefix",
        dest="prefix",
        default="",
        help="The value prefix (default=%(default)s).")

    # ---- optional record selection ----
    parser.add_argument(
        "--where",
        dest="where_column_name",
        default=None,
        help="The name of a column for a record selection test. "
        "(default=%(default)s).")
    parser.add_argument(
        "--in",
        dest="where_values",
        nargs="+",
        default=None,
        help="The list of values for a record selection test. "
        "(default=%(default)s).")

    parser.add_argument(
        "--presorted",
        dest="presorted",
        metavar="True|False",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        help="When True, the input file is presorted. (default=%(default)s).")

    # ---- standard debug, reader and value options ----
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
def main():
    """
    Test the KGTK unique processor.

    Parses the command line, optionally echoes the effective options, then
    builds a Unique instance from them and runs it.
    """
    cli: ArgumentParser = ArgumentParser()
    cli.add_argument(
        dest="input_file_path",
        help="The KGTK file with the input data",
        type=Path,
        nargs="?",
    )
    cli.add_argument(
        "--column",
        dest="column_name",
        help="The column to count unique values (required).",
        required=True,
    )
    cli.add_argument(
        "--empty",
        dest="empty_value",
        help="A value to substitute for empty values (default=%(default)s).",
        default="",
    )
    cli.add_argument(
        "-o",
        "--output-file",
        dest="output_file_path",
        help="The KGTK file to write (default=%(default)s).",
        type=Path,
        default="-",
    )
    cli.add_argument(
        "--label",
        dest="label_value",
        help="The output file label column value (default=%(default)s).",
        default="count",
    )
    # TODO: use an enum
    cli.add_argument(
        "--format",
        dest="output_format",
        help="The output file format and mode (default=%(default)s).",
        default="edge",
        choices=["edge", "node"],
    )
    cli.add_argument(
        "--prefix",
        dest="prefix",
        help="The value prefix (default=%(default)s).",
        default="",
    )

    KgtkReader.add_debug_arguments(cli)
    KgtkReaderOptions.add_arguments(cli, mode_options=True)
    KgtkValueOptions.add_arguments(cli)

    args: Namespace = cli.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO
    if args.errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # The positional input has no default, so it may be None; show "-".
        if args.input_file_path is not None:
            shown_input = str(args.input_file_path)
        else:
            shown_input = "-"
        print("input: %s" % shown_input, file=error_file)
        print("--column=%s" % args.column_name, file=error_file)
        print("--empty=%s" % args.empty_value, file=error_file)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file)
        print("--label=%s" % args.label_value, file=error_file)
        print("--format=%s" % args.output_format, file=error_file)
        print("--prefix=%s" % args.prefix, file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    processor: Unique = Unique(
        input_file_path=args.input_file_path,
        column_name=args.column_name,
        output_file_path=args.output_file_path,
        empty_value=args.empty_value,
        label_value=args.label_value,
        output_format=args.output_format,
        prefix=args.prefix,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )
    processor.process()
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Declare the command-line arguments for removing columns from a file.

    Args:
        parser: the KGTK argument parser that receives the arguments.
        parsed_shared_args: the already-parsed shared arguments; only
            `_expert` is read here.
    """
    _expert: bool = parsed_shared_args._expert

    parser.add_input_file(positional=True)
    parser.add_output_file()

    # The list of columns to drop (required).
    parser.add_argument(
        '-c',
        "--columns",
        action="store",
        type=str,
        dest="columns",
        nargs='+',
        required=True,
        help="Columns to remove as a comma- or space-separated strings, "
        "e.g., id,docid or id docid")

    # How the column list is parsed.  Each entry is
    # (option string, dest, help text, default value).
    parsing_flags = (
        ("--split-on-commas", "split_on_commas",
         "Parse the list of columns, splitting on commas. "
         "(default=%(default)s).", True),
        ("--split-on-spaces", "split_on_spaces",
         "Parse the list of columns, splitting on spaces. "
         "(default=%(default)s).", False),
        ("--strip-spaces", "strip_spaces",
         "Parse the list of columns, stripping whitespace. "
         "(default=%(default)s).", True),
    )
    for option, dest_name, help_text, default_value in parsing_flags:
        parser.add_argument(option,
                            dest=dest_name,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=default_value)

    # ---- standard debug, reader and value options ----
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    default_mode=KgtkReaderMode.NONE,
                                    expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Declare the command-line arguments for finding paths between the node
    pairs listed in a path file.

    Args:
        parser: the KGTK argument parser that receives the arguments.
        parsed_shared_args: the already-parsed shared arguments; `_expert`
            and `_mode` are read here.
    """
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    _expert: bool = parsed_shared_args._expert

    parser.add_input_file(positional=True)
    parser.add_output_file()
    parser.add_input_file(who="KGTK file with path start and end nodes.",
                          options=["--path-file", "--path_file"],
                          dest="path_file",
                          metavar="PATH_FILE",
                          optional=False)

    parser.add_argument(
        '--statistics-only',
        dest='statistics_only',
        help='If this flag is set, output only the statistics edges. Else, '
        'append the statistics to the original graph. (default=%(default)s)',
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")
    parser.add_argument(
        '--undirected',
        dest="undirected",
        help="Is the graph undirected or not? (default=%(default)s)",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")
    parser.add_argument('--max-hops',
                        '--max_hops',
                        action="store",
                        type=int,
                        dest="max_hops",
                        help="Maximum number of hops allowed.")

    # ---- path file column names ----
    parser.add_argument(
        "--path-source",
        action="store",
        type=str,
        dest="source_column_name",
        help='Name of the source column in the path file. '
        '(default: node1 or its alias)')
    # The help text here used to say "source column" (copied from
    # --path-source); this option names the target column.
    parser.add_argument(
        "--path-target",
        action="store",
        type=str,
        dest="target_column_name",
        help='Name of the target column in the path file. '
        '(default: node2 or its alias)')

    parser.add_argument(
        "--shortest-path",
        dest="shortest_path",
        metavar="True|False",
        help="When true, shortest paths are returned. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    # ---- standard debug, reader and value options ----
    # Reader options are registered once as shared defaults (with the mode
    # taken from the shared arguments), then once per file with
    # defaults=False.
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(
        parser,
        mode_options=True,
        default_mode=KgtkReaderMode[parsed_shared_args._mode],
        expert=_expert)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    who="input",
                                    expert=_expert,
                                    defaults=False)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    who="path",
                                    expert=_expert,
                                    defaults=False)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Declare the command-line arguments for counting records or non-empty
    column values.

    Args:
        parser: the KGTK argument parser that receives the arguments.
        parsed_shared_args: the already-parsed shared arguments; `_expert`,
            `_command` and `_mode` are read here.
    """
    # Modules needed only while declaring arguments.
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Shared arguments: whether `--expert` was given, and the command name
    # that was used to invoke this code.
    _expert: bool = parsed_shared_args._expert
    _command: str = parsed_shared_args._command

    def expert_help(msg: str) -> str:
        # Hide a help message unless `--expert` is on; the option itself
        # remains defined and keeps its default.
        return msg if _expert else SUPPRESS

    # Primary input and output files, no special features.
    parser.add_input_file()
    parser.add_output_file()

    # The default for --lines depends on the command name used.
    if _command == WC_COMMAND:
        count_records_default = DEFAULT_COUNT_RECORDS_WC
    else:
        count_records_default = DEFAULT_COUNT_RECORDS

    parser.add_argument(
        '-l',
        '--lines',
        dest="count_records",
        metavar="True/False",
        help="If true, count records and print a single number to stdout. "
        "If false, count non-empty values per column and produce a simple "
        "KGTK output file. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=count_records_default)

    # Expert option: shown in `--help` only together with `--expert`.
    parser.add_argument(
        "--count-property",
        dest="count_property",
        help=expert_help(
            "The property used for column count output edges. "
            "(default=%(default)s)."),
        default=DEFAULT_COUNT_PROPERTY)

    # Standard debugging arguments plus the KgtkReader and KgtkValue options.
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(
        parser,
        mode_options=True,
        default_mode=KgtkReaderMode[parsed_shared_args._mode],
        expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
def main():
    """
    Test the KGTK unreify-values processor.

    Parses the command line, optionally echoes the effective options, then
    builds a KgtkUnreifyValues instance from them and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument("-i",
                        "--input-file",
                        dest="input_file_path",
                        help="The KGTK input file. (default=%(default)s)",
                        type=Path,
                        default="-")
    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK output file. (default=%(default)s).",
                        type=Path,
                        default="-")
    parser.add_argument(
        "--reified-file",
        dest="reified_file_path",
        help=
        "A KGTK output file that will contain only the reified values. (default=%(default)s).",
        type=Path,
        default=None)
    parser.add_argument(
        "--unreified-file",
        dest="unreified_file_path",
        help=
        "A KGTK output file that will contain only the unreified values. (default=%(default)s).",
        type=Path,
        default=None)
    parser.add_argument(
        "--uninvolved-file",
        dest="uninvolved_file_path",
        help=
        "A KGTK output file that will contain only the uninvolved input records. (default=%(default)s).",
        type=Path,
        default=None)
    parser.add_argument("--output-format",
                        dest="output_format",
                        help="The file format (default=kgtk)",
                        type=str,
                        choices=KgtkWriter.OUTPUT_FORMAT_CHOICES)

    KgtkUnreifyValues.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=False, expert=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # This used to iterate over an undefined `input_file_paths` name,
        # which raised NameError.  There is a single --input-file option,
        # so report that value.
        print("--input-file=%s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        if args.reified_file_path is not None:
            print("--reified-file=%s" % str(args.reified_file_path),
                  file=error_file,
                  flush=True)
        if args.unreified_file_path is not None:
            print("--unreified-file=%s" % str(args.unreified_file_path),
                  file=error_file,
                  flush=True)
        if args.uninvolved_file_path is not None:
            print("--uninvolved-file=%s" % str(args.uninvolved_file_path),
                  file=error_file,
                  flush=True)
        if args.output_format is not None:
            print("--output-format=%s" % args.output_format,
                  file=error_file,
                  flush=True)
        if args.trigger_label_value is not None:
            print("--trigger-label=%s" % args.trigger_label_value,
                  file=error_file,
                  flush=True)
        if args.trigger_node2_value is not None:
            print("--trigger-node2=%s" % args.trigger_node2_value,
                  file=error_file,
                  flush=True)
        if args.value_label_value is not None:
            print("--value-label=%s" % args.value_label_value,
                  file=error_file,
                  flush=True)
        if args.old_label_value is not None:
            print("--old-label=%s" % args.old_label_value,
                  file=error_file,
                  flush=True)
        if args.new_label_value is not None:
            print("--new-label=%s" % args.new_label_value,
                  file=error_file,
                  flush=True)
        print("--allow-multiple-values=%s" % str(args.allow_multiple_values),
              file=error_file,
              flush=True)
        print("--allow-extra-columns=%s" % str(args.allow_extra_columns),
              file=error_file,
              flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kuv: KgtkUnreifyValues = KgtkUnreifyValues(
        input_file_path=args.input_file_path,
        output_file_path=args.output_file_path,
        reified_file_path=args.reified_file_path,
        unreified_file_path=args.unreified_file_path,
        uninvolved_file_path=args.uninvolved_file_path,
        trigger_label_value=args.trigger_label_value,
        trigger_node2_value=args.trigger_node2_value,
        value_label_value=args.value_label_value,
        old_label_value=args.old_label_value,
        new_label_value=args.new_label_value,
        allow_multiple_values=args.allow_multiple_values,
        allow_extra_columns=args.allow_extra_columns,
        reader_options=reader_options,
        value_options=value_options,
        output_format=args.output_format,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    kuv.process()