Exemple #1
0
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register this command's command-line arguments on the parser.

    Adds a positional input file and an output file, followed by the
    ID builder, reader debug, reader, and value option groups.

    Args:
        parser (KGTKArgumentParser): the parser that receives the arguments.
        parsed_shared_args (Namespace): previously parsed shared arguments;
            only its `_expert` attribute is read here.
    """
    # import modules locally
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Forwarded as the `expert` setting of the option groups below.
    _expert: bool = parsed_shared_args._expert

    parser.add_input_file(positional=True)
    parser.add_output_file()

    # The ID builder options are registered with expert=True on purpose:
    # show all the options, independent of the shared expert flag.
    KgtkIdBuilderOptions.add_arguments(parser, expert=True)
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
Exemple #2
0
def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace):
    """
    Register this command's command-line arguments on the parser.

    Adds an optional positional input file and an output file option (both
    defaulting to "-"), followed by the ID builder, reader debug, reader,
    and value option groups.

    Args:
        parser (KGTKArgumentParser): the parser that receives the arguments.
        parsed_shared_args (Namespace): previously parsed shared arguments;
            only its `_expert` attribute is read here.
    """

    # Forwarded as the `expert` setting of the option groups below.
    _expert: bool = parsed_shared_args._expert

    parser.add_argument("input_kgtk_file", nargs="?", type=Path, default="-",
                        help="The KGTK file to filter. May be omitted or '-' for stdin (default=%(default)s).")

    parser.add_argument("-o", "--output-file", dest="output_kgtk_file", type=Path, default="-",
                        help="The KGTK file to write (default=%(default)s).")

    # The ID builder options are registered with expert=True on purpose:
    # show all the options, independent of the shared expert flag.
    KgtkIdBuilderOptions.add_arguments(parser, expert=True)
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
Exemple #3
0
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the ntriples import command's arguments on the parser.

    Adds the ntriples input file(s), the main output file, and three
    optional files (reject, namespace input, updated namespace output),
    followed by the KgtkNtriples, ID builder, reader debug, reader, and
    value option groups.

    Args:
        parser (KGTKArgumentParser): the parser that receives the arguments.
        parsed_shared_args (Namespace): previously parsed shared arguments;
            its `_expert` and `_mode` attributes are read here.
    """
    from kgtk.imports.kgtkntriples import KgtkNtriples
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Forwarded as the `expert` setting of the reader option groups below.
    _expert: bool = parsed_shared_args._expert

    parser.add_input_file(who="The ntriples file(s) to import.",
                          allow_list=True)
    parser.add_output_file()

    parser.add_output_file(
        who="The ntriples output file for records that are rejected.",
        dest="reject_file",
        options=["--reject-file"],
        metavar="REJECT_FILE",
        optional=True)

    parser.add_input_file(who="The KGTK input file with known namespaces.",
                          dest="namespace_file",
                          options=["--namespace-file"],
                          metavar="NAMESPACE_FILE",
                          optional=True)

    parser.add_output_file(who="The KGTK output file with updated namespaces.",
                           dest="updated_namespace_file",
                           options=["--updated-namespace-file"],
                           metavar="NAMESPACE_FILE",
                           optional=True)

    KgtkNtriples.add_arguments(parser)
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    # The default reader mode is looked up by name from the shared `_mode`
    # argument.
    KgtkReaderOptions.add_arguments(
        parser,
        mode_options=True,
        default_mode=KgtkReaderMode[parsed_shared_args._mode],
        expert=_expert)
    KgtkValueOptions.add_arguments(parser)
Exemple #4
0
def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace):
    """
    Register the compact command's arguments on the parser.

    Adds the input and output files, the key-column selection, and the
    boolean switches for ID compaction, presorted input, sort verification,
    and ID building, followed by the ID builder, reader debug, reader, and
    value option groups.

    Args:
        parser (KGTKArgumentParser): the parser that receives the arguments.
        parsed_shared_args (Namespace): previously parsed shared arguments;
            only its `_expert` attribute is read here.
    """

    # Forwarded as the `expert` setting of the option groups below.
    _expert: bool = parsed_shared_args._expert

    parser.add_argument("input_kgtk_file", nargs="?", type=Path, default="-",
                        help="The KGTK file to filter. May be omitted or '-' for stdin (default=%(default)s).")

    parser.add_argument("--columns", dest="key_column_names", nargs='+', default=[ ],
                        help="The key columns to identify records for compaction. " +
                        "(default=id for node files, (node1, label, node2, id) for edge files).")

    # Each boolean switch below may be given bare (const=True) or with an
    # explicit value that is converted by optional_bool.
    parser.add_argument("--compact-id", dest="compact_id",
                        help="Indicate that the ID column in KGTK edge files should be compacted. " +
                        "Normally, if the ID column exists, it is not compacted, " +
                        "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--presorted", dest="sorted_input",
                        help="Indicate that the input has been presorted (or at least pregrouped) (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--verify-sort", dest="verify_sort",
                        help="If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=True)

    parser.add_argument("-o", "--output-file", dest="output_kgtk_file", type=Path, default="-",
                        help="The KGTK file to write (default=%(default)s).")

    parser.add_argument("--build-id", dest="build_id",
                        help="Build id values in an id column. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    KgtkIdBuilderOptions.add_arguments(parser, expert=_expert)
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
Exemple #5
0
def main():
    """
    Command-line test driver for the KGTK compact processor.

    Defines the command-line options, parses them, optionally echoes the
    effective settings to the error stream, and then builds a KgtkCompact
    from the parsed values and runs it.

    TODO: Support the list output file.
    """
    cli: ArgumentParser = ArgumentParser()

    # Keyword arguments common to every optional-boolean switch: the switch
    # may be given bare (const=True) or with a value parsed by optional_bool.
    flag = dict(type=optional_bool, nargs='?', const=True)

    cli.add_argument(dest="input_file_path", type=Path, nargs="?", default="-",
                     help="The KGTK file with the input data (default=%(default)s)")

    cli.add_argument("--columns", dest="key_column_names", nargs='+', default=[],
                     help=("The key columns to identify records for compaction. "
                           "(default=id for node files, (node1, label, node2, id) for edge files)."))

    cli.add_argument("--keep-first", dest="keep_first_names", nargs='+', default=[],
                     help=("If compaction results in a list of values for any column on this list, keep only the first value after sorting. "
                           "(default=none)."))

    cli.add_argument("--compact-id", dest="compact_id", default=False,
                     help=("Indicate that the ID column in KGTK edge files should be compacted. "
                           "Normally, if the ID column exists, it is not compacted, "
                           "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s)."),
                     **flag)

    cli.add_argument("--deduplicate", dest="deduplicate", default=False, metavar="True|False",
                     help=("Treat all columns as key columns, overriding --columns and --compact-id. "
                           "This will remove completely duplicate records without compacting any new lists. "
                           "(default=%(default)s)."),
                     **flag)

    cli.add_argument("--presorted", dest="sorted_input", default=False,
                     help="Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
                     **flag)

    cli.add_argument("--verify-sort", dest="verify_sort", default=True,
                     help="If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
                     **flag)

    cli.add_argument("--lists-in-input", dest="lists_in_input", default=True,
                     help="Assume that the input file may contain lists (disable when certain it does not). (default=%(default)s).",
                     **flag)

    cli.add_argument("--report-lists", dest="report_lists", default=False,
                     help="When True, report records with lists to the error output. (default=%(default)s).",
                     **flag)

    cli.add_argument("--exclude-lists", dest="exclude_lists", default=False,
                     help="When True, exclude records with lists from the output. (default=%(default)s).",
                     **flag)

    cli.add_argument("--output-only-lists", dest="output_only_lists", default=False,
                     help="When True, output only records containing lists. (default=%(default)s).",
                     **flag)

    cli.add_argument("-o", "--output-file", dest="output_file_path", type=Path, default="-",
                     help="The KGTK file to write (default=%(default)s).")

    cli.add_argument("--build-id", dest="build_id", default=False,
                     help="Build id values in an id column. (default=%(default)s).",
                     **flag)

    # Option groups contributed by the shared KGTK components.
    KgtkIdBuilderOptions.add_arguments(cli)
    KgtkReader.add_debug_arguments(cli)
    KgtkReaderOptions.add_arguments(cli, mode_options=True)
    KgtkValueOptions.add_arguments(cli)

    args: Namespace = cli.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    # NOTE(review): errors_to_stdout, show_options, verbose and very_verbose
    # are presumably registered by the option groups above -- confirm.
    if args.errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    def report(line: str) -> None:
        # Emit one settings line on the error stream, flushing immediately.
        print(line, file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        report(f"input: {args.input_file_path!s}")
        report(f"--output-file={args.output_file_path!s}")
        report("--columns " + " ".join(args.key_column_names))
        report("--keep-first " + " ".join(args.keep_first_names))
        report(f"--compact-id={args.compact_id!s}")
        report(f"--deduplicate={args.deduplicate!s}")
        report(f"--presorted={args.sorted_input!s}")
        report(f"--verify-sort={args.verify_sort!s}")
        report(f"--lists-in-input={args.lists_in_input!s}")
        report(f"--report-lists={args.report_lists!s}")
        report(f"--exclude-lists={args.exclude_lists!s}")
        report(f"--output-only-lists={args.output_only_lists!s}")
        report(f"--build-id={args.build_id!s}")
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    compactor: KgtkCompact = KgtkCompact(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        keep_first_names=args.keep_first_names,
        compact_id=args.compact_id,
        deduplicate=args.deduplicate,
        sorted_input=args.sorted_input,
        verify_sort=args.verify_sort,
        lists_in_input=args.lists_in_input,
        report_lists=args.report_lists,
        exclude_lists=args.exclude_lists,
        output_only_lists=args.output_only_lists,
        output_file_path=args.output_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    compactor.process()
Exemple #6
0
def main():
    """
    Command-line test driver for the KGTK ntriples importer.

    Defines the command-line options, parses them, optionally echoes the
    effective settings to the error stream, and then builds a KgtkNtriples
    from the parsed values and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument(
        "-i",
        "--input-files",
        dest="input_file_paths",
        nargs='*',
        help="The file(s) with the input ntriples data. (default=%(default)s)",
        type=Path,
        default="-")

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--reject-file",
        dest="reject_file_path",
        help=
        "The KGTK file into which to write rejected records. (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--namespace-file",
        dest="namespace_file_path",
        help="The KGTK file with known namespaces. (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--updated-namespace-file",
        dest="updated_namespace_file_path",
        help=
        "An updated KGTK file with known namespaces. (default=%(default)s).",
        type=Path,
        default=None)

    # Option groups contributed by the importer and shared KGTK components.
    KgtkNtriples.add_arguments(parser)
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # When -i is omitted, argparse converts the string default "-" with
    # type=Path and stores a single Path rather than a list.  Normalize to a
    # list so the code below can always iterate over the input paths.
    input_file_paths: typing.List[Path]
    if isinstance(args.input_file_paths, list):
        input_file_paths = args.input_file_paths
    else:
        input_file_paths = [args.input_file_paths]

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(
        args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    def show(line: str) -> None:
        # Emit one settings line on the error stream, flushing immediately.
        print(line, file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        show("--input-files %s" %
             " ".join([str(path) for path in input_file_paths]))
        show("--output-file=%s" % str(args.output_file_path))
        # TODO: show ifempty-specific options.
        # NOTE(review): "ifempty" looks like a copy/paste leftover -- confirm.
        if args.reject_file_path is not None:
            show("--reject-file=%s" % str(args.reject_file_path))
        if args.namespace_file_path is not None:
            show("--namespace-file=%s" % str(args.namespace_file_path))
        if args.updated_namespace_file_path is not None:
            show("--updated-namespace-file=%s" %
                 str(args.updated_namespace_file_path))
        show("--namespace-id-prefix %s" % args.namespace_id_prefix)
        show("--namespace-id-use-uuid %s" % str(args.namespace_id_use_uuid))
        show("--namespace-id-counter %s" % str(args.namespace_id_counter))
        show("--namespace-id-zfill %s" % str(args.namespace_id_zfill))
        show("--output-only-used-namespaces %s" %
             str(args.output_only_used_namespaces))
        show("--allow-lax-uri %s" % str(args.allow_lax_uri))
        show("--local-namespace-prefix %s" % args.local_namespace_prefix)
        show("--local-namespace-use-uuid %s" %
             str(args.local_namespace_use_uuid))
        show("--prefix-expansion-label %s" % args.prefix_expansion_label)
        show("--structured-value-label %s" % args.structured_value_label)
        show("--structured-uri-label %s" % args.structured_uri_label)
        show("--newnode-prefix %s" % args.newnode_prefix)
        show("--newnode-use-uuid %s" % str(args.newnode_use_uuid))
        show("--newnode-counter %s" % str(args.newnode_counter))
        show("--newnode-zfill %s" % str(args.newnode_zfill))
        show("--build-id=%s" % str(args.build_id))
        show("--escape-pipes=%s" % str(args.escape_pipes))
        show("--validate=%s" % str(args.validate))
        if args.override_uuid is not None:
            show("--override_uuid=%s" % str(args.override_uuid))

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kn: KgtkNtriples = KgtkNtriples(
        input_file_paths=input_file_paths,
        output_file_path=args.output_file_path,
        reject_file_path=args.reject_file_path,
        namespace_file_path=args.namespace_file_path,
        updated_namespace_file_path=args.updated_namespace_file_path,
        namespace_id_prefix=args.namespace_id_prefix,
        namespace_id_use_uuid=args.namespace_id_use_uuid,
        namespace_id_counter=args.namespace_id_counter,
        namespace_id_zfill=args.namespace_id_zfill,
        output_only_used_namespaces=args.output_only_used_namespaces,
        newnode_prefix=args.newnode_prefix,
        newnode_use_uuid=args.newnode_use_uuid,
        newnode_counter=args.newnode_counter,
        newnode_zfill=args.newnode_zfill,
        allow_lax_uri=args.allow_lax_uri,
        local_namespace_prefix=args.local_namespace_prefix,
        local_namespace_use_uuid=args.local_namespace_use_uuid,
        prefix_expansion_label=args.prefix_expansion_label,
        structured_value_label=args.structured_value_label,
        structured_uri_label=args.structured_uri_label,
        build_id=args.build_id,
        escape_pipes=args.escape_pipes,
        idbuilder_options=idbuilder_options,
        validate=args.validate,
        override_uuid=args.override_uuid,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    kn.process()
Exemple #7
0
def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Namespace):
    """
    Register the implode command's arguments on the parser.

    Adds the positional input file, the output file, an optional reject
    file, the column/prefix/type/field selections, and the boolean switches
    that control the operation, followed by the ID builder, reader debug,
    reader, and value option groups.

    Args:
        parser (KGTKArgumentParser): the parser that receives the arguments.
        parsed_shared_args (Namespace): previously parsed shared arguments;
            only its `_expert` attribute is read here.
    """
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalue import KgtkValueFields
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Forwarded as the `expert` setting of the reader and value option groups.
    _expert: bool = parsed_shared_args._expert

    parser.add_input_file(positional=True)
    parser.add_output_file()
    parser.add_output_file(who="The KGTK file for records that are rejected.",
                           dest="reject_file",
                           options=["--reject-file"],
                           metavar="REJECT_FILE",
                           optional=True)

    parser.add_argument("--column", dest="column_name",
                        help="The name of the column to explode. (default=%(default)s).",
                        default=KgtkFormat.NODE2)

    parser.add_argument("--prefix", dest="prefix",
                        help="The prefix for exploded column names. (default=%(default)s).",
                        default=KgtkFormat.NODE2 + ";" + KgtkFormat.KGTK_NAMESPACE)

    parser.add_argument("--types", dest="type_names", nargs='*',
                        help="The KGTK data types for which fields should be imploded. (default=%(default)s).",
                        choices=KgtkFormat.DataType.choices(),
                        default=KgtkFormat.DataType.choices())

    parser.add_argument("--without", dest="without_fields", nargs='*',
                        help="The KGTK fields to do without. (default=%(default)s).",
                        choices=KgtkValueFields.OPTIONAL_DEFAULT_FIELD_NAMES,
                        default=None)

    # Each boolean switch below may be given bare (const=True) or with an
    # explicit value that is converted by optional_bool.
    parser.add_argument("--overwrite", dest="overwrite_column",
                        help="Indicate that it is OK to overwrite an existing imploded column. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=True)

    parser.add_argument("--validate", dest="validate",
                        help="Validate imploded values. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=True)

    parser.add_argument("--escape-pipes", dest="escape_pipes",
                        help="When true, pipe characters (|) need to be escaped (\\|) per KGTK file format. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--quantities-include-numbers", dest="quantities_include_numbers",
                        help="When true, numbers are acceptable quantities. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=True)

    parser.add_argument("--general-strings", dest="general_strings",
                        help="When true, strings may include language qualified strings. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=True)

    parser.add_argument("--remove-prefixed-columns", dest="remove_prefixed_columns",
                        help="When true, remove all columns beginning with the prefix from the output file. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--ignore-unselected-types", dest="ignore_unselected_types",
                        help="When true, input records with valid but unselected data types will be passed through to output. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=True)

    parser.add_argument("--retain-unselected-types", dest="retain_unselected_types",
                        help="When true, input records with valid but unselected data types will be retain existing data on output. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=True)

    parser.add_argument("--build-id", dest="build_id",
                        help="Build id values in an id column. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--show-data-types", dest="show_data_types",
                        help="Print the list of data types and exit. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    parser.add_argument("--quiet", dest="quiet",
                        help="When true, suppress certain complaints unless verbose. (default=%(default)s).",
                        type=optional_bool, nargs='?', const=True, default=False)

    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
Exemple #8
0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,

        column_name: str,
        prefix: str,
        type_names: typing.List[str],
        without_fields: typing.Optional[typing.List[str]],
        overwrite_column: bool,
        validate: bool,
        escape_pipes: bool,
        quantities_include_numbers: bool,
        general_strings: bool,
        remove_prefixed_columns: bool,
        ignore_unselected_types: bool,
        retain_unselected_types: bool,
        build_id: bool,
        show_data_types: bool,
        quiet: bool,
        
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """
    Run the implode command with already-parsed arguments.

    Resolves the input, output, and optional reject file paths, builds the
    ID builder / reader / value option structures from `kwargs`, optionally
    echoes the effective settings to the error stream, and then constructs
    a KgtkImplode and calls its `process()` method.

    When `show_data_types` is true, the available data type names are
    printed to the error stream and the function returns without
    processing any data.

    Returns:
        0 on success (including the `show_data_types` early exit).

    Raises:
        KGTKException: if processing raises SystemExit or any other
            Exception; the original exception is attached as the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys
    
    from kgtk.exceptions import KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReaderOptions
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilderOptions
    from kgtk.reshape.kgtkimplode import KgtkImplode
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(reject_file, who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)    
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file, flush=True)

        print("--column %s" % column_name, file=error_file, flush=True)
        print("--prefix %s" % prefix, file=error_file, flush=True)
        print("--overwrite %s" % str(overwrite_column), file=error_file, flush=True)
        print("--validate %s" % str(validate), file=error_file, flush=True)
        print("--escape-pipes %s" % str(escape_pipes), file=error_file, flush=True)
        print("--quantities-include-numbers %s" % str(quantities_include_numbers), file=error_file, flush=True)
        print("--general-strings %s" % str(general_strings), file=error_file, flush=True)
        print("--remove-prefixed-columns %s" % str(remove_prefixed_columns), file=error_file, flush=True)
        print("--ignore-unselected-types %s" % str(ignore_unselected_types), file=error_file, flush=True)
        print("--retain-unselected-types %s" % str(retain_unselected_types), file=error_file, flush=True)
        if type_names is not None:
            print("--types %s" % " ".join(type_names), file=error_file, flush=True)
        if without_fields is not None:
            print("--without %s" % " ".join(without_fields), file=error_file, flush=True)
        print("--show-data-types %s" % str(show_data_types), file=error_file, flush=True)
        print("--quiet %s" % str(quiet), file=error_file, flush=True)
        print("--build-id=%s" % str(build_id), file=error_file, flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Listing the data types is a standalone request: print them and stop.
    if show_data_types:
        data_type: str
        for data_type in KgtkFormat.DataType.choices():
            print("%s" % data_type, file=error_file, flush=True)
        return 0

    # KgtkImplode is always handed a list, so map "not specified" to empty.
    wf: typing.List[str] = without_fields if without_fields is not None else list()

    try:
        ex: KgtkImplode = KgtkImplode(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_kgtk_file,
            column_name=column_name,
            prefix=prefix,
            type_names=type_names,
            without_fields=wf,
            overwrite_column=overwrite_column,
            validate=validate,
            escape_pipes=escape_pipes,
            quantities_include_numbers=quantities_include_numbers,
            general_strings=general_strings,
            remove_prefixed_columns=remove_prefixed_columns,
            ignore_unselected_types=ignore_unselected_types,
            retain_unselected_types=retain_unselected_types,
            quiet=quiet,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        ex.process()

        return 0

    # Convert failures into KGTKException, chaining the original exception
    # so the underlying traceback is preserved as the explicit cause.
    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
Exemple #9
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        key_column_names: typing.List[str],
        compact_id: bool,
        sorted_input: bool,
        verify_sort: bool,
        build_id: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """
    Build a KgtkCompact from the command line options and run it.

    Args:
        input_file: the input file selection, resolved with
            KGTKArgumentParser.get_input_file().
        output_file: the output file selection, resolved with
            KGTKArgumentParser.get_output_file().
        key_column_names: passed through to KgtkCompact.
        compact_id: passed through to KgtkCompact.
        sorted_input: passed through to KgtkCompact (the --presorted option).
        verify_sort: passed through to KgtkCompact.
        build_id: passed through to KgtkCompact.
        errors_to_stdout: when True, messages go to sys.stdout instead of
            sys.stderr.
        errors_to_stderr: accepted for interface compatibility; it is not
            consulted here.
        show_options: when True, print the effective options to the error
            stream before processing.
        verbose: passed through to KgtkCompact.
        very_verbose: passed through to KgtkCompact.
        **kwargs: the source for the ID builder, reader, and value option
            structures.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: when processing raises SystemExit or any other
            Exception.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        print("--columns=%s" % " ".join(key_column_names), file=error_file)
        print("--compact-id=%s" % str(compact_id), file=error_file, flush=True)
        # Every option line goes to the selected error stream, so that the
        # option dump is not split between stdout and stderr.
        print("--presorted=%s" % str(sorted_input), file=error_file)
        print("--verify-sort=%s" % str(verify_sort),
              file=error_file,
              flush=True)
        print("--build-id=%s" % str(build_id), file=error_file, flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        ex: KgtkCompact = KgtkCompact(
            input_file_path=input_kgtk_file,
            key_column_names=key_column_names,
            compact_id=compact_id,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            output_file_path=output_kgtk_file,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        ex.process()

        return 0

    except SystemExit as e:
        # SystemExit is not a subclass of Exception, so it needs its own
        # clause to be reported as a KGTKException.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
Exemple #10
0
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line options for this command.

    The same option set is registered for both command names that share
    this code.  When the command is DEDUP_COMMAND, the compaction-specific
    options are hidden from the help message unless expert help was
    requested, --columns documents a different default, and --deduplicate
    defaults to True instead of False.

    Args:
        parser: the KGTK argument parser that receives the options.
        parsed_shared_args: the parsed shared arguments; the `_command` and
            `_expert` attributes are read from it.
    """
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    command_name: str = parsed_shared_args._command
    expert_help: bool = parsed_shared_args._expert

    def expert_only(msg: str) -> str:
        # Hide the option from the help message unless expert help is on.
        # The option itself is still registered and initialized.
        return msg if expert_help else SUPPRESS

    def as_is(msg: str) -> str:
        # Always show the help message.
        return msg

    def add_flag(option: str,
                 dest: str,
                 help_text: str,
                 default: bool,
                 show_metavar: bool = True) -> None:
        # Register an optional-valued boolean option.  Some of the options
        # advertise a True|False metavar and some do not.
        extra: dict = {"metavar": "True|False"} if show_metavar else {}
        parser.add_argument(option,
                            dest=dest,
                            help=help_text,
                            type=optional_bool,
                            nargs='?',
                            const=True,
                            default=default,
                            **extra)

    parser.add_input_file(positional=True)
    parser.add_output_file()
    parser.add_output_file(
        who=("A KGTK output file that will contain only the rows containing lists."
             " This file will have the same columns as the primary output file."),
        dest="list_output_file",
        options=["--list-output-file"],
        metavar="LIST_OUTPUT_FILE",
        optional=True)

    # The two commands differ only in help visibility, the documented
    # default for --columns, and the default for --deduplicate.
    dedup_mode: bool = bool(command_name == DEDUP_COMMAND)
    if dedup_mode:
        show = expert_only
        columns_default = "(default=all columns)."
    else:
        show = as_is
        columns_default = "(default=id for node files, (node1, label, node2, id) for edge files)."

    parser.add_argument(
        "--columns",
        dest="key_column_names",
        help=show("The key columns to identify records for compaction. "
                  + columns_default),
        nargs='+',
        default=[])

    add_flag(
        "--compact-id", "compact_id",
        show("Indicate that the ID column in KGTK edge files should be compacted. "
             "Normally, if the ID column exists, it is not compacted, "
             "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s)."),
        False)

    add_flag(
        "--deduplicate", "deduplicate",
        show("Treat all columns as key columns, overriding --columns and --compact-id. "
             "This will remove completely duplicate records without compacting any new lists. "
             "(default=%(default)s)."),
        dedup_mode)

    add_flag(
        "--lists-in-input", "lists_in_input",
        show("Assume that the input file may contain lists (disable when certain it does not). (default=%(default)s)."),
        True,
        show_metavar=False)

    parser.add_argument(
        "--keep-first",
        dest="keep_first_names",
        help=show("If compaction results in a list of values for any column on this list, keep only the first value after sorting. "
                  "(default=none)."),
        nargs='+',
        default=[])

    # The remaining options are the same for both commands.
    add_flag(
        "--presorted", "sorted_input",
        "Indicate that the input has been presorted (or at least pregrouped) (default=%(default)s).",
        False)

    add_flag(
        "--verify-sort", "verify_sort",
        "If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
        True)

    add_flag(
        "--report-lists", "report_lists",
        "When True, report records with lists to the error output. (default=%(default)s).",
        False,
        show_metavar=False)

    add_flag(
        "--exclude-lists", "exclude_lists",
        "When True, exclude records with lists from the output. (default=%(default)s).",
        False,
        show_metavar=False)

    add_flag(
        "--output-only-lists", "output_only_lists",
        "When True, only records containing lists will be written to the primary output file. (default=%(default)s).",
        False,
        show_metavar=False)

    add_flag(
        "--build-id", "build_id",
        "Build id values in an id column. (default=%(default)s).",
        False)

    KgtkIdBuilderOptions.add_arguments(parser, expert=expert_help)
    KgtkReader.add_debug_arguments(parser, expert=expert_help)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    expert=expert_help)
    KgtkValueOptions.add_arguments(parser, expert=expert_help)
Exemple #11
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        new_edges_file: KGTKFiles,
        base_columns: typing.Optional[typing.List[str]] = None,
        columns_to_lower: typing.Optional[typing.List[str]] = None,
        label_values: typing.Optional[typing.List[str]] = None,
        lift_separator: str = KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        ignore_empty_node1: bool = False,
        ignore_empty_node2: bool = False,
        add_id: bool = False,
        lower: bool = False,
        normalize: bool = False,
        deduplicate_new_edges: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """
    Remove selected columns from a KGTK edge file and emit their values as
    new edges.

    Each input row is written to the output without the lowered columns.
    For every non-empty value in a lowered column a new edge is produced:
    node1 is the row's value in the associated base column, label is the
    label value chosen for the lowered column, and node2 is the value itself
    (list values are split, one edge per list item).  The new edges are
    written to the new-edges file when one was given, otherwise to the main
    output file.

    The columns to lower are selected by one of three patterns:
      1. columns_to_lower and base_columns are both given and are paired
         by position; label_values, when given, supplies the labels.
      2. only columns_to_lower is given; each column name is split at
         lift_separator into a base column name and a label (or, when
         normalizing, the ID column is the base and the column name is
         the label).
      3. columns_to_lower is not given; every non-key column that matches
         the lift pattern against a base column (or every non-key column
         without the separator, when normalizing) is lowered.

    Args:
        input_file: the input file selection.
        output_file: the output file selection.
        new_edges_file: the optional output file for the new edges.
        base_columns: the base column names (see the patterns above).
        columns_to_lower: the names of the columns to lower.
        label_values: the label values for pattern 1.
        lift_separator: the separator between the base column name and the
            label in a lifted column name.
        ignore_empty_node1: skip, rather than reject, an empty base value.
        ignore_empty_node2: skip, rather than reject, an empty lowered value.
        add_id: when True, pass each new edge through a KgtkIdBuilder.
        lower: enable lowering columns whose base is found by name.
        normalize: enable lowering columns onto the ID column.
        deduplicate_new_edges: when True, emit each distinct
            (node1, label, node2) new edge only once.
        errors_to_stdout: when True, messages go to sys.stdout instead of
            sys.stderr.
        errors_to_stderr: accepted for interface compatibility; it is not
            consulted here.
        show_options: when True, print the effective options first.
        verbose: print progress messages.
        very_verbose: passed through to the reader and the writers.
        **kwargs: the source for the ID builder, reader, and value option
            structures.

    Returns:
        0 on success.  1 when an exception raised during processing was
        handed to kgtk_exception_auto_handler.

    Raises:
        KGTKException: when neither lower nor normalize was requested.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalue import KgtkValue
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    new_edges_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(new_edges_file,
                                                            who="Label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if new_edges_kgtk_file is not None:
            print("--label-file=%s" % str(new_edges_kgtk_file),
                  file=error_file)

        if base_columns is not None:
            print("--base-columns %s" % " ".join(base_columns),
                  file=error_file)
        if columns_to_lower is not None:
            print("--columns-to-lower %s" % " ".join(columns_to_lower),
                  file=error_file)
        if label_values is not None:
            print("--label-values %s" % " ".join(label_values),
                  file=error_file)
        print("--lift-separator=%s" % lift_separator, file=error_file)
        print("--add-id=%s" % add_id, file=error_file)
        print("--lower=%s" % lower, file=error_file)
        print("--ignore-empty-node1=%s" % ignore_empty_node1, file=error_file)
        print("--ignore-empty-node2=%s" % ignore_empty_node2, file=error_file)
        print("--normalize=%s" % normalize, file=error_file)
        print("--deduplicate-labels=%s" % deduplicate_new_edges,
              file=error_file)

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if not lower and not normalize:
        raise KGTKException(
            "One or both of --lower and --normalize must be requested.")

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Map the index of a column being removed to a pair: the index of the
        # base column that supplies its node1 value, and the label value for
        # the new edges.
        lower_map: typing.MutableMapping[int, typing.Tuple[int, str]] = dict()

        node1_column_name: str = kr.get_node1_column_actual_name()
        label_column_name: str = kr.get_label_column_actual_name()
        node2_column_name: str = kr.get_node2_column_actual_name()
        id_column_name: str = kr.get_id_column_actual_name()

        key_column_names: typing.List[str] = list()
        key_column_idxs: typing.Set[int] = set()

        if node1_column_name != "":
            if verbose:
                print("Node1 column name: %s" % node1_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node1_column_name)
            key_column_idxs.add(kr.node1_column_idx)

        if label_column_name != "":
            if verbose:
                print("Label column name: %s" % label_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(label_column_name)
            key_column_idxs.add(kr.label_column_idx)

        if node2_column_name != "":
            if verbose:
                print("Node2 column name: %s" % node2_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(node2_column_name)
            key_column_idxs.add(kr.node2_column_idx)

        if id_column_name != "":
            if verbose:
                print("Id column name: %s" % id_column_name,
                      file=error_file,
                      flush=True)
            key_column_names.append(id_column_name)
            key_column_idxs.add(kr.id_column_idx)
        elif normalize:
            raise KGTKException(
                "--normalize was requested but the ID column was not found.")

        base_name: str
        new_label_value: str
        column_name: str
        idx: int
        # There are three option patterns.

        if columns_to_lower is not None and len(
                columns_to_lower) > 0 and base_columns is not None and len(
                    base_columns) > 0:
            # Pattern 1: len(columns_to_lower) > 0 and len(base_columns) == len(columns_to_lower)
            # column_names and base_columns are paired. New records use label_values if specified.
            if len(columns_to_lower) != len(base_columns):
                raise KGTKException(
                    "There are %d columns to lower but only %d base columns." %
                    (len(columns_to_lower), len(base_columns)))

            if label_values is not None and len(label_values) > 0 and len(
                    label_values) != len(columns_to_lower):
                raise KGTKException(
                    "There are %d columns to lower but only %d label values." %
                    (len(columns_to_lower), len(label_values)))

            for idx, column_name in enumerate(columns_to_lower):
                base_name = base_columns[idx]
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if base_name not in kr.column_names:
                    raise KGTKException(
                        "For column name %s, base name %s is unknown" %
                        (repr(column_name), repr(base_name)))

                if normalize and base_name == id_column_name:
                    # Normalizing onto the ID column always uses the column
                    # name as the label.
                    lower_map[kr.column_name_map[column_name]] = (
                        kr.column_name_map[base_name], column_name)
                else:
                    if not lower:
                        raise KGTKException(
                            "--lower is not enabled for column %s, base name %s"
                            % (repr(column_name), repr(base_name)))
                    if label_values is not None and len(
                            label_values) > 0 and len(label_values[idx]) > 0:
                        lower_map[kr.column_name_map[column_name]] = (
                            kr.column_name_map[base_name], label_values[idx])
                    else:
                        lower_map[kr.column_name_map[column_name]] = (
                            kr.column_name_map[base_name], column_name)

        elif columns_to_lower is not None and len(columns_to_lower) > 0 and (
                base_columns is None or len(base_columns) == 0):
            # Pattern 2: len(columns_to_lower) > 0 and len(base_columns) == 0
            # Each column name is split at the lift separator to determine the base name and label value.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            for idx, column_name in enumerate(columns_to_lower):
                if column_name not in kr.column_names:
                    raise KGTKException(
                        "Column %s is an unknown column, cannot remove it." %
                        repr(column_name))

                if column_name in key_column_names:
                    raise KGTKException(
                        "Column %s is a key column, cannot remove it." %
                        repr(column_name))

                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)
                    if base_name not in kr.column_names:
                        raise KGTKException(
                            "For column name %s, base name %s is not known" %
                            (repr(column_name), repr(base_name)))

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    raise KGTKException(
                        "Unable to parse column name %s, no separator (%s)." %
                        (repr(column_name), repr(lift_separator)))

                lower_map[kr.column_name_map[column_name]] = (
                    kr.column_name_map[base_name], new_label_value)

        elif columns_to_lower is None or len(columns_to_lower) == 0:
            # Pattern 3: len(columns_to_lower) == 0.
            # Any column that matches a lift pattern against one of the
            # key columns (node1, label, node2, id, or their aliases)
            # will be lowered.
            if len(lift_separator) == 0:
                raise KGTKException("The --lift-separator must not be empty.")

            if base_columns is None or len(base_columns) == 0:
                # The base name list wasn't supplied.  Use [node1, label, node2, id]
                base_columns = list(key_column_names)
                if verbose:
                    print("Using the default base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)
            else:
                if verbose:
                    print("Using these base columns: %s" %
                          " ".join(base_columns),
                          file=error_file,
                          flush=True)

            for idx, column_name in enumerate(kr.column_names):
                # Skip the node1, label, node2, and id columns
                if idx in key_column_idxs:
                    if verbose:
                        print("column %s is a key column, skipping." %
                              repr(column_name),
                              file=error_file,
                              flush=True)
                    continue

                # Does this column match a lifting pattern?
                if lower and lift_separator in column_name:
                    base_name, new_label_value = column_name.split(
                        lift_separator, 1)

                    if base_name not in base_columns:
                        if verbose:
                            print(
                                "Column %s contains base name %s, which is not a base column."
                                % (repr(column_name), repr(base_name)),
                                file=error_file,
                                flush=True)
                        continue

                elif normalize:
                    base_name = id_column_name
                    new_label_value = column_name

                else:
                    if verbose:
                        print(
                            "Column %s does not contain the separator %s and not normalizing, skipping."
                            % (repr(column_name), repr(lift_separator)),
                            file=error_file,
                            flush=True)
                    continue

                # This test should be redundant.
                if base_name in kr.column_names:
                    lower_map[idx] = (kr.column_name_map[base_name],
                                      new_label_value)
                else:
                    raise KGTKException(
                        "Base name %s was unexpectedly not found." %
                        repr(base_name))

        if len(lower_map) == 0:
            raise KGTKException("There are no columns to lower or normalize.")

        if verbose:
            print("The following columns will be lowered or normalized",
                  file=error_file,
                  flush=True)
            for idx in sorted(lower_map.keys()):
                column_name = kr.column_names[idx]
                base_idx, new_label_value = lower_map[idx]
                base_name = kr.column_names[base_idx]
                print(" %s from %s (label %s)" %
                      (column_name, base_name, repr(new_label_value)),
                      file=error_file,
                      flush=True)

        # The output keeps every input column that is not being lowered.
        output_column_names: typing.List[str] = list()
        for idx, column_name in enumerate(kr.column_names):
            if idx not in lower_map:
                output_column_names.append(column_name)

        # Create the ID builder.
        idb: typing.Optional[KgtkIdBuilder] = None
        if add_id:
            idb = KgtkIdBuilder.from_column_names(output_column_names,
                                                  idbuilder_options)
            output_column_names = idb.column_names.copy()

        if verbose:
            print("The output columns are: %s" % " ".join(output_column_names),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            output_column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=False,  # Simplifies writing the labels
            verbose=verbose,
            very_verbose=very_verbose)
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        lkw: typing.Optional[KgtkWriter] = None
        if new_edges_kgtk_file is not None:
            if verbose:
                print("Opening the label output file: %s" %
                      str(new_edges_kgtk_file),
                      file=error_file,
                      flush=True)

            label_column_names = [
                node1_column_name, label_column_name, node2_column_name
            ]
            lkw = KgtkWriter.open(label_column_names,
                                  new_edges_kgtk_file,
                                  mode=KgtkWriter.Mode.EDGE,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        # Optionally deduplicate the new edges.  Each key is:
        #  node1_value + KEY_FIELD_SEPARATOR + label value + KEY_FIELD_SEPARATOR + node2_value
        label_set: typing.Set[str] = set()
        label_key: str

        input_line_count: int = 0
        output_line_count: int = 0
        label_line_count: int = 0
        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # Write the input row without the lowered columns.
            output_row: typing.List[str] = kw.shuffle(
                row, shuffle_list=shuffle_list)
            kw.write(output_row)
            output_line_count += 1

            # Counts the new edges built from this input row; it restarts
            # for every input row.
            id_seq_num: int = 0
            column_idx: int
            for column_idx in lower_map.keys():
                node1_idx: int
                node1_idx, new_label_value = lower_map[column_idx]
                node1_value: str
                node1_value = row[node1_idx]
                if len(node1_value) == 0:
                    if ignore_empty_node1:
                        continue  # Ignore empty node1 values.
                    else:
                        raise KGTKException(
                            "Empty node1 value when lowering %d to %d: %s in input line %d"
                            % (column_idx, node1_idx, new_label_value,
                               input_line_count))

                item: str = row[column_idx]
                if len(item) == 0:
                    if ignore_empty_node2:
                        continue  # Ignore empty node2 values.
                    else:
                        raise KGTKException(
                            "Empty node2 value when lowering %d to %d: %s in input line %d"
                            % (column_idx, node1_idx, new_label_value,
                               input_line_count))

                # This item might be a KGTK list.  Let's split it, because
                # lists aren't allowed in the node2 values we'll generate.
                node2_value: str
                for node2_value in KgtkValue.split_list(item):
                    if len(node2_value) == 0:
                        if ignore_empty_node2:
                            continue  # Ignore empty node2 values in a list.
                        else:
                            raise KGTKException(
                                "Empty node2 value in a list when lowering %d to %d: %s in input line %d"
                                % (column_idx, node1_idx, new_label_value,
                                   input_line_count))

                    if deduplicate_new_edges:
                        label_key = node1_value + KgtkFormat.KEY_FIELD_SEPARATOR + new_label_value + KgtkFormat.KEY_FIELD_SEPARATOR + node2_value
                        if label_key in label_set:
                            continue
                        else:
                            label_set.add(label_key)

                    # Build the new edge in the input column layout, then
                    # shuffle it into the output column layout.
                    lowered_input_row: typing.List[str] = [
                        "" for idx in range(kr.column_count)
                    ]
                    lowered_input_row[kr.node1_column_idx] = node1_value
                    lowered_input_row[kr.label_column_idx] = new_label_value
                    lowered_input_row[kr.node2_column_idx] = node2_value

                    lowered_output_row: typing.List[str] = kw.shuffle(
                        lowered_input_row, shuffle_list=shuffle_list)
                    if idb is not None:
                        # Advance the sequence number so that each new edge
                        # built from this input row passes a distinct value
                        # to the ID builder.
                        id_seq_num += 1
                        lowered_output_row = idb.build(lowered_output_row,
                                                       id_seq_num,
                                                       already_added=True)
                    if lkw is not None:
                        lkw.write(lowered_output_row)
                        label_line_count += 1
                    else:
                        kw.write(lowered_output_row)
                        label_line_count += 1
                        output_line_count += 1

        if verbose:
            print("Read %d rows, wrote %d rows with %d labels." %
                  (input_line_count, output_line_count, label_line_count),
                  file=error_file,
                  flush=True)

        kw.close()
        if lkw is not None:
            lkw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Exemple #12
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """
    Copy a KGTK file to the output, passing every row through a KgtkIdBuilder.

    Args:
        input_file: the input file selection from the command line parser.
        output_file: the output file selection from the command line parser.
        errors_to_stdout: when True, messages go to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; not read here.
        show_options: when True, print the resolved options before processing.
        verbose: passed through to the reader and the writer.
        very_verbose: passed through to the reader and the writer.
        **kwargs: remaining options, from which the ID builder, reader and
            value option structures are built.

    Returns:
        0 on success.

    Raises:
        KGTKException: when processing fails or an exit is requested.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kr: typing.Optional[KgtkReader] = None
        ew: typing.Optional[KgtkWriter] = None
        try:
            # First create the KgtkReader.  It provides parameters used by
            # the ID column builder. Next, create the ID column builder,
            # which provides a possibly revised list of column names for the
            # KgtkWriter.  Create the KgtkWriter.  Last, process the data
            # stream.

            # Open the input file.
            kr = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )

            # Create the ID builder.
            idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

            # Open the output file.
            ew = KgtkWriter.open(idb.column_names,
                                 output_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 require_all_columns=True,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=False,
                                 gzip_in_parallel=False,
                                 verbose=verbose,
                                 very_verbose=very_verbose)

            # Process the input file, building IDs.
            idb.process(kr, ew)

        finally:
            # Clean up whatever was opened, even when processing failed, so
            # that file handles are not leaked.
            if ew is not None:
                ew.close()
            if kr is not None:
                kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Keep the original exception as the cause for debugging.
        raise KGTKException(str(e)) from e
Exemple #13
0
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line arguments for this command.

    Adds the input and output file arguments, the record compaction options
    (--columns, --compact-id, --presorted, --verify-sort, --lists-in-input,
    --build-id), and then the ID builder, reader debug, reader, and value
    option groups.

    Args:
        parser: the KGTK argument parser to populate.
        parsed_shared_args: the shared arguments parsed so far; only the
            `_expert` flag is read here.
    """
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    _expert: bool = parsed_shared_args._expert

    # This helper function makes it easy to suppress options from
    # The help message.  The options are still there, and initialize
    # what they need to initialize.
    def h(msg: str) -> str:
        if _expert:
            return msg
        else:
            return SUPPRESS

    parser.add_input_file(positional=True)
    parser.add_output_file()

    parser.add_argument(
        "--columns",
        dest="key_column_names",
        help="The key columns to identify records for compaction. " +
        "(default=id for node files, (node1, label, node2, id) for edge files).",
        nargs='+',
        default=[])

    parser.add_argument(
        "--compact-id",
        dest="compact_id",
        help=
        "Indicate that the ID column in KGTK edge files should be compacted. "
        + "Normally, if the ID column exists, it is not compacted, " +
        "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")

    parser.add_argument(
        "--presorted",
        dest="sorted_input",
        help=
        "Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")

    parser.add_argument(
        "--verify-sort",
        dest="verify_sort",
        help=
        "If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True,
        metavar="True|False")

    # Like the other boolean switches, show "True|False" in the usage text
    # instead of the argparse-generated placeholder.
    parser.add_argument(
        "--lists-in-input",
        dest="lists_in_input",
        help=
        "Assume that the input file may contain lists (disable when certain it does not). (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True,
        metavar="True|False")

    parser.add_argument(
        "--build-id",
        dest="build_id",
        help="Build id values in an id column. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False,
        metavar="True|False")

    KgtkIdBuilderOptions.add_arguments(parser, expert=_expert)
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
Exemple #14
0
def main():
    """
    Test the KGTK implode processor.

    Parses the command line, optionally echoes the resolved options to the
    error stream, then builds a KgtkImplode from those options and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    # The argparse plumbing shared by every optional boolean switch below.
    bool_switch: dict = dict(type=optional_bool, nargs='?', const=True)

    parser.add_argument(
        dest="input_file_path",
        help="The KGTK file with the input data. (default=%(default)s)",
        type=Path,
        nargs="?",
        default="-")

    parser.add_argument(
        "--column",
        dest="column_name",
        help="The name of the column to implode. (default=%(default)s).",
        default="node2")

    parser.add_argument(
        "--types",
        dest="type_names",
        nargs='*',
        help=
        "The KGTK data types for which fields should be imploded. (default=%(default)s).",
        choices=KgtkFormat.DataType.choices(),
        default=KgtkFormat.DataType.choices())

    parser.add_argument(
        "--without",
        dest="without_fields",
        nargs='*',
        help="The KGTK fields to do without. (default=%(default)s).",
        choices=KgtkValueFields.OPTIONAL_DEFAULT_FIELD_NAMES,
        default=None)

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--prefix",
        dest="prefix",
        help="The prefix for exploded column names. (default=%(default)s).",
        default="node2;kgtk:")

    parser.add_argument(
        "--overwrite",
        dest="overwrite_column",
        help=
        "Indicate that it is OK to overwrite an existing imploded column. (default=%(default)s).",
        default=True,
        **bool_switch)

    parser.add_argument(
        "--validate",
        dest="validate",
        help="Validate imploded values. (default=%(default)s).",
        default=True,
        **bool_switch)

    parser.add_argument(
        "--escape-pipes",
        dest="escape_pipes",
        help=
        "When true, pipe characters (|) need to be escaped (\\|) per KGTK file format. (default=%(default)s).",
        default=False,
        **bool_switch)

    parser.add_argument(
        "--quantities-include-numbers",
        dest="quantities_include_numbers",
        help=
        "When true, numbers are acceptable quantities. (default=%(default)s).",
        default=True,
        **bool_switch)

    parser.add_argument(
        "--general-strings",
        dest="general_strings",
        help=
        "When true, strings may include language qualified strings. (default=%(default)s).",
        default=True,
        **bool_switch)

    parser.add_argument(
        "--remove-prefixed-columns",
        dest="remove_prefixed_columns",
        help=
        "When true, remove all columns beginning with the prefix from the output file. (default=%(default)s).",
        default=False,
        **bool_switch)

    parser.add_argument(
        "--ignore-unselected-types",
        dest="ignore_unselected_types",
        help=
        "When true, input records with valid but unselected data types will be passed through to output. (default=%(default)s).",
        default=True,
        **bool_switch)

    parser.add_argument(
        "--retain-unselected-types",
        dest="retain_unselected_types",
        help=
        "When true, input records with valid but unselected data types will retain existing data on output. (default=%(default)s).",
        default=True,
        **bool_switch)

    parser.add_argument(
        "--build-id",
        dest="build_id",
        help="Build id values in an id column. (default=%(default)s).",
        default=False,
        **bool_switch)

    parser.add_argument(
        "--reject-file",
        dest="reject_file_path",
        help=
        "The KGTK file into which to write rejected records (default=%(default)s).",
        type=Path,
        default=None)

    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(
        args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    def show(line: str) -> None:
        # Each option line is flushed immediately to the error stream.
        print(line, file=error_file, flush=True)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        show("input: %s" % str(args.input_file_path))
        show("--column %s" % args.column_name)
        show("--prefix %s" % args.prefix)
        show("--overwrite %s" % str(args.overwrite_column))
        show("--validate %s" % str(args.validate))
        show("--escape-pipes %s" % str(args.escape_pipes))
        show("--quantities-include-numbers %s" %
             str(args.quantities_include_numbers))
        show("--general-strings %s" % str(args.general_strings))
        show("--remove-prefixed-columns %s" %
             str(args.remove_prefixed_columns))
        show("--ignore-unselected-types %s" %
             str(args.ignore_unselected_types))
        show("--retain-unselected-types %s" %
             str(args.retain_unselected_types))
        if args.type_names is not None:
            show("--types %s" % " ".join(args.type_names))
        if args.without_fields is not None:
            show("--without %s" % " ".join(args.without_fields))
        show("--output-file=%s" % str(args.output_file_path))
        if args.reject_file_path is not None:
            show("--reject-file=%s" % str(args.reject_file_path))
        show("--build-id=%s" % str(args.build_id))
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    # --without defaults to None; the processor is handed an empty list then.
    without_fields: typing.List[str] = (
        args.without_fields if args.without_fields is not None else list())

    ex: KgtkImplode = KgtkImplode(
        input_file_path=args.input_file_path,
        column_name=args.column_name,
        prefix=args.prefix,
        type_names=args.type_names,
        without_fields=without_fields,
        overwrite_column=args.overwrite_column,
        validate=args.validate,
        escape_pipes=args.escape_pipes,
        quantities_include_numbers=args.quantities_include_numbers,
        general_strings=args.general_strings,
        remove_prefixed_columns=args.remove_prefixed_columns,
        ignore_unselected_types=args.ignore_unselected_types,
        retain_unselected_types=args.retain_unselected_types,
        output_file_path=args.output_file_path,
        reject_file_path=args.reject_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    ex.process()
Exemple #15
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        method: str = "blockmodel",
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """
    Cluster the nodes of a KGTK edge file and write one edge per node.

    The input edges are loaded into an undirected graph, the graph is
    partitioned with the selected graph_tool inference method, and a
    `<node> in cluster_<...>` row is written for every node in order of
    first appearance.  The "mcmc" method adds a `node2;prob` column.

    Args:
        input_file: the input file selection from the command line parser.
        output_file: the output file selection from the command line parser.
        errors_to_stdout: when True, messages go to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; not read here.
        show_options: when True, print the resolved options before processing.
        verbose: passed through to the reader and the writer.
        very_verbose: passed through to the reader and the writer.
        method: one of "blockmodel", "nested" or "mcmc".
        **kwargs: remaining options, from which the ID builder, reader and
            value option structures are built.

    Returns:
        0 on success.

    Raises:
        KGTKException: when `method` is not supported, when processing
            fails, or when an exit is requested.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        print("--method=%s" % str(method), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Reject an unsupported method before doing any work.  Previously this
    # fell through every branch and failed with an unbound-variable error
    # when closing a writer that had never been opened.
    supported_methods = ("blockmodel", "nested", "mcmc")
    if method not in supported_methods:
        raise KGTKException("Unknown method '%s' (expected one of: %s)" %
                            (method, ", ".join(supported_methods)))

    try:
        kr: typing.Optional[KgtkReader] = None
        kw: typing.Optional[KgtkWriter] = None
        try:
            # Open the input file.
            kr = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )

            g = Graph(directed=False)

            # Give every distinct node value a vertex index, in order of
            # first appearance, and remember the edges.
            node_index: typing.Dict[str, int] = {}
            nodes: typing.List[str] = []
            edges: typing.List[typing.Tuple[str, str]] = []
            for row in kr:
                node1 = row[kr.node1_column_idx]
                node2 = row[kr.node2_column_idx]
                for node in (node1, node2):
                    if node not in node_index:
                        node_index[node] = len(nodes)
                        nodes.append(node)
                edges.append((node1, node2))

            g.add_vertex(len(nodes))
            for node1, node2 in edges:
                g.add_edge(g.vertex(node_index[node1]),
                           g.vertex(node_index[node2]))

            column_names: typing.List[str] = ["node1", "label", "node2"]
            rows: typing.List[typing.List[str]]

            if method == 'blockmodel':
                state = graph_tool.inference.minimize.minimize_blockmodel_dl(g)
                blocks = state.get_blocks()
                rows = [[node, 'in', 'cluster_' + str(blocks[i])]
                        for i, node in enumerate(nodes)]

            elif method == 'nested':
                state = graph_tool.inference.minimize.\
                    minimize_nested_blockmodel_dl(g)

                # Each path starts as the node's own index; one block id per
                # hierarchy level is then prepended, coarsest first.
                paths = [[str(i)] for i in range(len(nodes))]
                for level in state.levels:
                    # Stop at the first level that has a single block.
                    if level.get_B() == 1:
                        break
                    level_blocks = level.get_blocks()
                    for path in paths:
                        # NOTE(review): the lookup key is always the node's
                        # original index (as a string), not the block found
                        # at the previous level -- confirm this is intended.
                        path.insert(0, str(level_blocks[path[-1]]))

                rows = []
                for node, path in zip(nodes, paths):
                    if path:
                        path.pop()  # Drop the node's own index.
                    rows.append([node, 'in', 'cluster_' + '_'.join(path)])

            else:  # method == 'mcmc'
                state = graph_tool.inference.minimize.minimize_blockmodel_dl(g)
                graph_tool.inference.mcmc.\
                    mcmc_equilibrate(state, wait=1000, mcmc_args=dict(niter=10))

                dS, nattempts, nmoves = state.multiflip_mcmc_sweep(niter=1000)
                graph_tool.inference.mcmc.\
                    mcmc_equilibrate(state, wait=10,
                                     nbreaks=2, mcmc_args=dict(niter=10))

                bs = []  # collect some partitions

                def collect_partitions(s):
                    bs.append(s.b.a.copy())

                # Now we collect partitions for exactly 100,000 sweeps, at
                # intervals of 10 sweeps (force_niter=10000, niter=10):
                graph_tool.inference.mcmc.mcmc_equilibrate(
                    state,
                    force_niter=10000,
                    mcmc_args=dict(niter=10),
                    callback=collect_partitions)

                # Disambiguate partitions and obtain marginals
                pmode = graph_tool.inference.partition_modes.\
                    PartitionModeState(bs, converge=True)
                pv = list(pmode.get_marginal(g))
                m = list(pmode.get_max(g))

                column_names = ["node1", "label", "node2", 'node2;prob']
                # The probability is the share of the marginal that falls on
                # the node's most likely block.
                rows = [[
                    node, 'in', 'cluster_' + str(m[i]),
                    str(pv[i][m[i]] / sum(pv[i]))
                ] for i, node in enumerate(nodes)]

            kw = KgtkWriter.open(
                column_names,
                output_kgtk_file,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            for output_row in rows:
                kw.write(output_row)

        finally:
            # Close whatever was opened, even when clustering failed, so
            # that file handles are not leaked.
            if kr is not None:
                kr.close()
            if kw is not None:
                kw.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Keep the original exception as the cause for debugging.
        raise KGTKException(str(e)) from e
Exemple #16
0
def main():
    """
    Test the KGTK compact processor.

    Parses the command line, optionally echoes the resolved options to the
    error stream, then builds a KgtkCompact from those options and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    # The argparse plumbing shared by every optional boolean switch below.
    bool_switch = dict(type=optional_bool, nargs='?', const=True)

    parser.add_argument(
        dest="input_file_path",
        nargs="?",
        type=Path,
        default="-",
        help="The KGTK file with the input data (default=%(default)s)")

    parser.add_argument(
        "--columns",
        dest="key_column_names",
        nargs='+',
        default=[],
        help=("The key columns to identify records for compaction. "
              "(default=id for node files, (node1, label, node2, id) for edge files)."))

    parser.add_argument(
        "--compact-id",
        dest="compact_id",
        default=False,
        help=("Indicate that the ID column in KGTK edge files should be compacted. "
              "Normally, if the ID column exists, it is not compacted, "
              "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s)."),
        **bool_switch)

    parser.add_argument(
        "--presorted",
        dest="sorted_input",
        default=False,
        help="Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
        **bool_switch)

    parser.add_argument(
        "--verify-sort",
        dest="verify_sort",
        default=True,
        help="If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
        **bool_switch)

    parser.add_argument(
        "-o",
        "--output-file",
        dest="output_file_path",
        type=Path,
        default="-",
        help="The KGTK file to write (default=%(default)s).")

    parser.add_argument(
        "--build-id",
        dest="build_id",
        default=False,
        help="Build id values in an id column. (default=%(default)s).",
        **bool_switch)

    # Option groups owned by the shared KGTK components.
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    error_file: typing.TextIO = sys.stderr
    if args.errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        option_lines = (
            "input: %s" % str(args.input_file_path),
            "--columns %s" % " ".join(args.key_column_names),
            "--compact-id=%s" % str(args.compact_id),
            "--presorted=%s" % str(args.sorted_input),
            "--verify-sort=%s" % str(args.verify_sort),
            "--output-file=%s" % str(args.output_file_path),
            "--build-id=%s" % str(args.build_id),
        )
        for option_line in option_lines:
            print(option_line, file=error_file, flush=True)
        for option_group in (idbuilder_options, reader_options, value_options):
            option_group.show(out=error_file)

    compactor: KgtkCompact = KgtkCompact(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        compact_id=args.compact_id,
        sorted_input=args.sorted_input,
        verify_sort=args.verify_sort,
        output_file_path=args.output_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    compactor.process()
Exemple #17
0
def add_arguments_extended(parser: KGTKArgumentParser,
                           parsed_shared_args: Namespace):
    """
    Register the command line arguments for the lower/normalize commands.

    The defaults of --lower and --normalize, and whether their help text is
    reserved for expert mode, depend on which command name was invoked.

    Args:
        parser: the KGTK argument parser to populate.
        parsed_shared_args: the shared arguments parsed so far; `_expert`,
            `_command` and `_mode` are read here.
    """
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.utils.argparsehelpers import optional_bool
    from kgtk.value.kgtkvalue import KgtkValue
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    _expert: bool = parsed_shared_args._expert
    _command: str = parsed_shared_args._command
    _mode: str = parsed_shared_args._mode

    # Hide a help message unless expert help was requested.  The option
    # itself is still registered and still gets its default.
    def h(msg: str) -> str:
        return msg if _expert else SUPPRESS

    # The argparse plumbing shared by every optional boolean switch below.
    bool_switch = dict(type=optional_bool,
                       nargs='?',
                       const=True,
                       metavar="True|False")

    parser.add_input_file(positional=True)
    parser.add_output_file()
    parser.add_output_file(
        who=("An optional output file for new edges (normalized and/or lowered). "
             "If omitted, new edges will go in the main output file."),
        dest="new_edges_file",
        options=["--new-edges-file"],
        metavar="NEW_EDGES_FILE",
        optional=True)

    parser.add_argument(
        "--columns",
        "--columns-to-lower",
        "--columns-to-remove",
        dest="columns_to_lower",
        action="store",
        type=str,
        nargs='+',
        help="Columns to lower and remove as a space-separated list. (default=all columns other than key columns)")

    parser.add_argument(
        "--base-columns",
        dest="base_columns",
        nargs='*',
        help=h("Optionally, explicitly list the base column for each column being lowered. "
               " --base-columns and --columns-to-lower must have the same number of entries."))

    parser.add_argument(
        "--label-values",
        dest="label_values",
        action="store",
        type=str,
        nargs='*',
        help=h("When not empty, a list of label values to use for lowered edges when --base-columns is used, overriding the original column names. (default=%(default)s)"))

    parser.add_argument(
        "--lift-separator",
        dest="lift_separator",
        default=KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SEPARATOR,
        help=h("The separator between the base column and the label value. (default=%(default)s)."))

    parser.add_argument(
        "--ignore-empty-node1",
        dest="ignore_empty_node1",
        default=False,
        help=h("When True, ignore attempts to lower into a new record with an empty node1 value. (default=%(default)s)"),
        **bool_switch)

    parser.add_argument(
        "--ignore-empty-node2",
        dest="ignore_empty_node2",
        default=False,
        help=h("When True, ignore attempts to lower into a new record with an empty node2 value. (default=%(default)s)"),
        **bool_switch)

    parser.add_argument(
        "--add-id",
        dest="add_id",
        default=False,
        help="When True, add an id column to the output (if not already present). (default=%(default)s)",
        **bool_switch)

    # Under the dedicated command names only one of the two operations is on
    # by default and both switches are expert-only; under any other command
    # name both are on by default and always documented.
    if _command == LOWER_COMMAND:
        lower_default, normalize_default, expert_only = True, False, True
    elif _command == NORMALIZE_EDGES_COMMAND:
        lower_default, normalize_default, expert_only = False, True, True
    else:
        lower_default, normalize_default, expert_only = True, True, False

    lower_help: str = "When True, lower columns that match a lift pattern. (default=%(default)s)"
    normalize_help: str = "When True, normalize columns that do not match a lift pattern. (default=%(default)s)"
    if expert_only:
        lower_help = h(lower_help)
        normalize_help = h(normalize_help)

    parser.add_argument("--lower",
                        dest="lower",
                        default=lower_default,
                        help=lower_help,
                        **bool_switch)

    parser.add_argument("--normalize",
                        dest="normalize",
                        default=normalize_default,
                        help=normalize_help,
                        **bool_switch)

    parser.add_argument(
        "--deduplicate-new-edges",
        dest="deduplicate_new_edges",
        default=True,
        help="When True, deduplicate new edges. Not suitable for large files. (default=%(default)s).",
        **bool_switch)

    # A "NONE" shared mode selects the NONE reader mode; anything else
    # selects EDGE.
    if _mode == "NONE":
        default_reader_mode = KgtkReaderMode.NONE
    else:
        default_reader_mode = KgtkReaderMode.EDGE

    KgtkIdBuilderOptions.add_arguments(
        parser,
        default_style=KgtkIdBuilderOptions.CONCAT_NLN_NUM_STYLE,
        expert=_expert)
    KgtkReader.add_debug_arguments(parser, expert=_expert)
    KgtkReaderOptions.add_arguments(parser,
                                    mode_options=True,
                                    default_mode=default_reader_mode,
                                    expert=_expert)
    KgtkValueOptions.add_arguments(parser, expert=_expert)
Exemple #18
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        namespace_file: KGTKFiles,
        updated_namespace_file: KGTKFiles,
        namespace_id_prefix: str,
        namespace_id_use_uuid: bool,
        namespace_id_counter: int,
        namespace_id_zfill: int,
        output_only_used_namespaces: bool,
        allow_lax_uri: bool,
        local_namespace_prefix: str,
        local_namespace_use_uuid: bool,
        prefix_expansion_label: str,
        structured_value_label: str,
        structured_uri_label: str,
        newnode_prefix: str,
        newnode_use_uuid: bool,
        newnode_counter: int,
        newnode_zfill: int,
        build_id: bool,
        escape_pipes: bool,
        validate: bool,
        override_uuid: typing.Optional[str],
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run the KgtkNtriples importer with the parsed command-line options.

    The file arguments are resolved to paths through KGTKArgumentParser, the
    ID builder, reader, and value option structures are built from
    ``kwargs``, and everything is handed to ``KgtkNtriples``, whose
    ``process()`` method is then called.

    Args:
        input_file: Input file argument(s); resolved to a list of paths.
        output_file: Output file argument; resolved to a single path.
        reject_file: Optional reject file argument (may resolve to None).
        namespace_file: Optional namespace input file argument.
        updated_namespace_file: Optional updated-namespace output file
            argument.
        namespace_id_prefix, namespace_id_use_uuid, namespace_id_counter,
        namespace_id_zfill, output_only_used_namespaces, allow_lax_uri,
        local_namespace_prefix, local_namespace_use_uuid,
        prefix_expansion_label, structured_value_label,
        structured_uri_label, newnode_prefix, newnode_use_uuid,
        newnode_counter, newnode_zfill, build_id, escape_pipes, validate,
        override_uuid: Passed unchanged to ``KgtkNtriples`` under the same
            keyword names.
        errors_to_stdout: When True, diagnostics are written to stdout
            instead of stderr.
        errors_to_stderr: Part of the shared debug-argument interface; it is
            not read by this function.
        show_options: When True, the effective options are echoed to the
            error stream before processing starts.
        verbose: Passed to ``KgtkNtriples``.
        very_verbose: Passed to ``KgtkNtriples``.
        **kwargs: Remaining parsed options, consumed by the ``from_dict``
            constructors of the three option structures.

    Returns:
        0 when ``process()`` completes without raising.

    Raises:
        KGTKException: If constructing or running ``KgtkNtriples`` raises
            any ``Exception`` or ``SystemExit``; the original exception is
            attached as the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.imports.kgtkntriples import KgtkNtriples
    from kgtk.io.kgtkreader import KgtkReaderOptions
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the file arguments into concrete paths.
    input_file_paths: typing.List[
        Path] = KGTKArgumentParser.get_input_file_list(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_file_path: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            reject_file, who="KGTK reject file")

    namespace_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_input_file(
            namespace_file, who="KGTK namespace file")
    updated_namespace_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            updated_namespace_file, who="KGTK updated namespace file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-files %s" %
              " ".join([str(path) for path in input_file_paths]),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(output_kgtk_file),
              file=error_file,
              flush=True)
        # Optional files are only reported when they were supplied.
        if reject_file_path is not None:
            print("--reject-file=%s" % str(reject_file_path),
                  file=error_file,
                  flush=True)
        if namespace_kgtk_file is not None:
            print("--namespace-file=%s" % str(namespace_kgtk_file),
                  file=error_file,
                  flush=True)
        if updated_namespace_kgtk_file is not None:
            print("--updated-namespace-file=%s" %
                  str(updated_namespace_kgtk_file),
                  file=error_file,
                  flush=True)

        print("--namespace-id-prefix %s" % namespace_id_prefix,
              file=error_file,
              flush=True)
        print("--namespace-id-use-uuid %s" % str(namespace_id_use_uuid),
              file=error_file,
              flush=True)
        print("--namespace-id-counter %s" % str(namespace_id_counter),
              file=error_file,
              flush=True)
        print("--namespace-id-zfill %s" % str(namespace_id_zfill),
              file=error_file,
              flush=True)
        print("--output-only-used-namespaces %s" %
              str(output_only_used_namespaces),
              file=error_file,
              flush=True)

        print("--allow-lax-uri %s" % str(allow_lax_uri),
              file=error_file,
              flush=True)

        print("--local-namespace-prefix %s" % local_namespace_prefix,
              file=error_file,
              flush=True)
        print("--local-namespace-use-uuid %s" % str(local_namespace_use_uuid),
              file=error_file,
              flush=True)

        print("--prefix-expansion-label %s" % prefix_expansion_label,
              file=error_file,
              flush=True)
        print("--structured-value-label %s" % structured_value_label,
              file=error_file,
              flush=True)
        print("--structured-uri-label %s" % structured_uri_label,
              file=error_file,
              flush=True)

        print("--newnode-prefix %s" % newnode_prefix,
              file=error_file,
              flush=True)
        print("--newnode-use-uuid %s" % str(newnode_use_uuid),
              file=error_file,
              flush=True)
        print("--newnode-counter %s" % str(newnode_counter),
              file=error_file,
              flush=True)
        print("--newnode-zfill %s" % str(newnode_zfill),
              file=error_file,
              flush=True)

        print("--build-id=%s" % str(build_id), file=error_file, flush=True)

        print("--escape-pipes=%s" % str(escape_pipes),
              file=error_file,
              flush=True)

        print("--validate=%s" % str(validate), file=error_file, flush=True)

        print("--override-uuid=%s" % str(override_uuid),
              file=error_file,
              flush=True)

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kn: KgtkNtriples = KgtkNtriples(
            input_file_paths=input_file_paths,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_file_path,
            updated_namespace_file_path=updated_namespace_kgtk_file,
            namespace_file_path=namespace_kgtk_file,
            namespace_id_prefix=namespace_id_prefix,
            namespace_id_use_uuid=namespace_id_use_uuid,
            namespace_id_counter=namespace_id_counter,
            namespace_id_zfill=namespace_id_zfill,
            output_only_used_namespaces=output_only_used_namespaces,
            newnode_prefix=newnode_prefix,
            newnode_use_uuid=newnode_use_uuid,
            newnode_counter=newnode_counter,
            newnode_zfill=newnode_zfill,
            allow_lax_uri=allow_lax_uri,
            local_namespace_prefix=local_namespace_prefix,
            local_namespace_use_uuid=local_namespace_use_uuid,
            prefix_expansion_label=prefix_expansion_label,
            structured_value_label=structured_value_label,
            structured_uri_label=structured_uri_label,
            build_id=build_id,
            escape_pipes=escape_pipes,
            validate=validate,
            override_uuid=override_uuid,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kn.process()

        return 0

    # SystemExit derives from BaseException, so it needs its own handler;
    # both handlers chain the original exception as the explicit cause.
    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
Exemple #19
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        list_output_file: KGTKFiles,
        key_column_names: typing.List[str],
        keep_first_names: typing.List[str],
        compact_id: bool,
        deduplicate: bool,
        sorted_input: bool,
        verify_sort: bool,
        lists_in_input: bool,
        report_lists: bool,
        exclude_lists: bool,
        output_only_lists: bool,
        build_id: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run KgtkCompact with the parsed command-line options.

    The file arguments are resolved to paths through KGTKArgumentParser, the
    ID builder, reader, and value option structures are built from
    ``kwargs``, and everything is handed to ``KgtkCompact``, whose
    ``process()`` method is then called.

    Args:
        input_file: Input file argument; resolved to a single path.
        output_file: Output file argument; resolved to a single path.
        list_output_file: Optional list output file argument (may resolve
            to None).
        key_column_names: Column names, echoed as ``--columns`` and passed
            to ``KgtkCompact``.
        keep_first_names: Column names, echoed as ``--keep-first`` and
            passed to ``KgtkCompact``.
        compact_id, deduplicate, sorted_input, verify_sort, lists_in_input,
        report_lists, build_id: Passed unchanged to ``KgtkCompact`` under
            the same keyword names.
        exclude_lists: Passed to ``KgtkCompact``; may not be combined with
            ``output_only_lists``.
        output_only_lists: Passed to ``KgtkCompact``; may not be combined
            with ``exclude_lists``.
        errors_to_stdout: When True, diagnostics are written to stdout
            instead of stderr.
        errors_to_stderr: Only forwarded to
            ``KgtkReader.show_debug_arguments`` when options are shown.
        show_options: When True, the effective options are echoed to the
            error stream before processing starts.
        verbose: Passed to ``KgtkCompact``.
        very_verbose: Passed to ``KgtkCompact``.
        **kwargs: Remaining parsed options, consumed by the ``from_dict``
            constructors of the three option structures.

    Returns:
        0 when ``process()`` completes without raising.

    Raises:
        KGTKException: If both ``exclude_lists`` and ``output_only_lists``
            are set, or if constructing or running ``KgtkCompact`` raises
            any ``Exception`` or ``SystemExit`` (the original exception is
            attached as the cause).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.reshape.kgtkcompact import KgtkCompact
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the file arguments into concrete paths.
    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    list_output_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(
            list_output_file, who="KGTK list output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        # The optional list output file is only reported when supplied.
        if list_output_kgtk_file is not None:
            print("--list-output-file=%s" % str(list_output_kgtk_file),
                  file=error_file,
                  flush=True)
        print("--columns=%s" % " ".join(key_column_names), file=error_file)
        print("--keep-first=%s" % " ".join(keep_first_names), file=error_file)
        print("--compact-id=%s" % str(compact_id), file=error_file, flush=True)
        print("--deduplicate=%s" % str(deduplicate),
              file=error_file,
              flush=True)
        print("--presorted=%s" % str(sorted_input),
              file=error_file,
              flush=True)
        print("--verify-sort=%s" % str(verify_sort),
              file=error_file,
              flush=True)
        print("--lists-in-input=%s" % str(lists_in_input),
              file=error_file,
              flush=True)
        print("--report-lists=%s" % str(report_lists),
              file=error_file,
              flush=True)
        print("--exclude-lists=%s" % str(exclude_lists),
              file=error_file,
              flush=True)
        print("--output-only-lists=%s" % str(output_only_lists),
              file=error_file,
              flush=True)
        print("--build-id=%s" % str(build_id), file=error_file, flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        KgtkReader.show_debug_arguments(errors_to_stdout=errors_to_stdout,
                                        errors_to_stderr=errors_to_stderr,
                                        show_options=show_options,
                                        verbose=verbose,
                                        very_verbose=very_verbose,
                                        out=error_file)
        print("=======", file=error_file, flush=True)

    # The two list-filtering modes are mutually exclusive; reject the
    # combination before any processing starts.
    if exclude_lists and output_only_lists:
        raise KGTKException(
            "--exclude-lists and --output-only-lists may not be used together."
        )

    try:
        ex: KgtkCompact = KgtkCompact(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            list_output_file_path=list_output_kgtk_file,
            key_column_names=key_column_names,
            keep_first_names=keep_first_names,
            compact_id=compact_id,
            deduplicate=deduplicate,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            lists_in_input=lists_in_input,
            report_lists=report_lists,
            exclude_lists=exclude_lists,
            output_only_lists=output_only_lists,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        ex.process()

        return 0

    # SystemExit derives from BaseException, so it needs its own handler;
    # both handlers chain the original exception as the explicit cause.
    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e