Beispiel #1
0
def main():
    """
    Test the KGTK node file reader.

    Parse the command line, open the requested file with
    NodeReader.open_node_file() using NODE-mode reader options, iterate over
    every row the reader yields, and print the number of rows read to
    standard output.
    """
    parser = ArgumentParser()
    # The positional file argument is optional (nargs="?"); when it is omitted
    # args.kgtk_file is None and is handed to the reader unchanged.
    parser.add_argument(dest="kgtk_file", help="The KGTK node file to read", type=Path, nargs="?")
    KgtkReader.add_debug_arguments(parser, expert=True)
    KgtkReaderOptions.add_arguments(parser, validate_by_default=True, expert=True)
    KgtkValueOptions.add_arguments(parser, expert=True)
    args = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    # NOTE(review): errors_to_stdout, show_options, verbose and very_verbose
    # are presumably registered by KgtkReader.add_debug_arguments() -- confirm.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, mode=KgtkReaderMode.NODE)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    if args.show_options:
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    nr: NodeReader = NodeReader.open_node_file(args.kgtk_file,
                                               error_file=error_file,
                                               options=reader_options,
                                               value_options=value_options,
                                               verbose=args.verbose, very_verbose=args.very_verbose)

    # Drain the reader; only the number of rows it yields is of interest.
    line_count: int = 0
    row: typing.List[str]
    for row in nr:
        line_count += 1
    print("Read %d lines" % line_count)
Beispiel #2
0
def main():
    """
    Test the KGTK expand processor.

    Parse the command line, optionally echo the effective options to the
    error stream, then build a KgtkExpand instance from the parsed arguments
    and run its process() method.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument(dest="input_file_path", help="The KGTK file with the input data (default=%(default)s)", type=Path, nargs="?", default="-")

    parser.add_argument(      "--columns", dest="key_column_names", help="The key columns will not be expanded (default=None).", nargs='+', default = [ ])

    parser.add_argument("-o", "--output-file", dest="output_file_path", help="The KGTK file to write (default=%(default)s).", type=Path, default="-")

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # TODO: show any further expand-specific options.
        print("input: %s" % str(args.input_file_path), file=error_file, flush=True)
        print("--columns %s" % " ".join(args.key_column_names), file=error_file, flush=True)
        # Send this line to error_file like every other option echo, so it
        # does not land on stdout (the default output file is "-").
        print("--output-file=%s" % str(args.output_file_path), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    ex: KgtkExpand = KgtkExpand(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        output_file_path=args.output_file_path,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    ex.process()
Beispiel #3
0
def main():
    """
    Test the KGTK copy template.

    Parse the command line, optionally echo the effective options to the
    error stream, then build a KgtkCopyTemplate from the parsed arguments and
    run its process() method.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument("-i", "--input-file", dest="input_file_path",
                        help="The KGTK input file. (default=%(default)s)", type=Path, default="-")

    parser.add_argument("-o", "--output-file", dest="output_file_path",
                        help="The KGTK output file. (default=%(default)s).", type=Path, default="-")

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # This command takes exactly one input file, stored on
        # args.input_file_path; echo that value.
        print("--input-file=%s" % str(args.input_file_path), file=error_file, flush=True)
        print("--output-file=%s" % str(args.output_file_path), file=error_file, flush=True)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kct: KgtkCopyTemplate = KgtkCopyTemplate(
        input_file_path=args.input_file_path,
        output_file_path=args.output_file_path,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    kct.process()
Beispiel #4
0
def main():
    """
    Command-line driver for exercising the KGTK compact processor.

    Registers the compaction options, parses the command line, optionally
    echoes every effective option to the error stream, and finally builds a
    KgtkCompact from the parsed values and runs its process() method.

    TODO: Support the list output file.
    """
    cli: ArgumentParser = ArgumentParser()

    def add_flag(option: str, dest: str, help_text: str, default: bool, **extra) -> None:
        # Every boolean switch here has the same shape: an optional value
        # parsed by optional_bool, with a bare flag meaning True.
        cli.add_argument(option,
                         dest=dest,
                         help=help_text,
                         type=optional_bool,
                         nargs='?',
                         const=True,
                         default=default,
                         **extra)

    cli.add_argument(dest="input_file_path",
                     help="The KGTK file with the input data (default=%(default)s)",
                     type=Path,
                     nargs="?",
                     default="-")

    cli.add_argument("--columns",
                     dest="key_column_names",
                     help=("The key columns to identify records for compaction. "
                           "(default=id for node files, (node1, label, node2, id) for edge files)."),
                     nargs='+',
                     default=[])

    cli.add_argument("--keep-first",
                     dest="keep_first_names",
                     help=("If compaction results in a list of values for any column on this list, "
                           "keep only the first value after sorting. "
                           "(default=none)."),
                     nargs='+',
                     default=[])

    add_flag("--compact-id", "compact_id",
             ("Indicate that the ID column in KGTK edge files should be compacted. "
              "Normally, if the ID column exists, it is not compacted, "
              "as there are use cases that need to maintain distinct lists of secondary edges "
              "for each ID value. (default=%(default)s)."),
             False)

    add_flag("--deduplicate", "deduplicate",
             ("Treat all columns as key columns, overriding --columns and --compact-id. "
              "This will remove completely duplicate records without compacting any new lists. "
              "(default=%(default)s)."),
             False,
             metavar="True|False")

    add_flag("--presorted", "sorted_input",
             "Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
             False)

    add_flag("--verify-sort", "verify_sort",
             "If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
             True)

    add_flag("--lists-in-input", "lists_in_input",
             "Assume that the input file may contain lists (disable when certain it does not). (default=%(default)s).",
             True)

    add_flag("--report-lists", "report_lists",
             "When True, report records with lists to the error output. (default=%(default)s).",
             False)

    add_flag("--exclude-lists", "exclude_lists",
             "When True, exclude records with lists from the output. (default=%(default)s).",
             False)

    add_flag("--output-only-lists", "output_only_lists",
             "When True, output only records containing lists. (default=%(default)s).",
             False)

    cli.add_argument("-o", "--output-file",
                     dest="output_file_path",
                     help="The KGTK file to write (default=%(default)s).",
                     type=Path,
                     default="-")

    add_flag("--build-id", "build_id",
             "Build id values in an id column. (default=%(default)s).",
             False)

    # Shared option groups contributed by the other KGTK components.
    KgtkIdBuilderOptions.add_arguments(cli)
    KgtkReader.add_debug_arguments(cli)
    KgtkReaderOptions.add_arguments(cli, mode_options=True)
    KgtkValueOptions.add_arguments(cli)

    args: Namespace = cli.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO
    if args.errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        def show(text: str) -> None:
            # One flushed line per option on the diagnostic stream.
            print(text, file=error_file, flush=True)

        show("input: %s" % str(args.input_file_path))
        show("--output-file=%s" % str(args.output_file_path))
        show("--columns %s" % " ".join(args.key_column_names))
        show("--keep-first %s" % " ".join(args.keep_first_names))
        show("--compact-id=%s" % str(args.compact_id))
        show("--deduplicate=%s" % str(args.deduplicate))
        show("--presorted=%s" % str(args.sorted_input))
        show("--verify-sort=%s" % str(args.verify_sort))
        show("--lists-in-input=%s" % str(args.lists_in_input))
        show("--report-lists=%s" % str(args.report_lists))
        show("--exclude-lists=%s" % str(args.exclude_lists))
        show("--output-only-lists=%s" % str(args.output_only_lists))
        show("--build-id=%s" % str(args.build_id))
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    compactor: KgtkCompact = KgtkCompact(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        keep_first_names=args.keep_first_names,
        compact_id=args.compact_id,
        deduplicate=args.deduplicate,
        sorted_input=args.sorted_input,
        verify_sort=args.verify_sort,
        lists_in_input=args.lists_in_input,
        report_lists=args.report_lists,
        exclude_lists=args.exclude_lists,
        output_only_lists=args.output_only_lists,
        output_file_path=args.output_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    compactor.process()
Beispiel #5
0
def main():
    """
    Test the KGTK ntriples importer.

    Parse the command line, optionally echo the effective options to the
    error stream, then build a KgtkNtriples from the parsed arguments and run
    its process() method.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument(
        "-i",
        "--input-files",
        dest="input_file_paths",
        nargs='*',
        help="The file(s) with the input ntriples data. (default=%(default)s)",
        type=Path,
        default="-")

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--reject-file",
        dest="reject_file_path",
        help=
        "The KGTK file into which to write rejected records. (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--namespace-file",
        dest="namespace_file_path",
        help="The KGTK file with known namespaces. (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--updated-namespace-file",
        dest="updated_namespace_file_path",
        help=
        "An updated KGTK file with known namespaces. (default=%(default)s).",
        type=Path,
        default=None)

    # NOTE(review): the namespace/newnode/build-id/validate attributes read
    # below are presumably registered by KgtkNtriples.add_arguments() -- confirm.
    KgtkNtriples.add_arguments(parser)
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # When -i is omitted, argparse runs the string default "-" through
    # type=Path and stores a single Path, not a list.  Normalize so the rest
    # of this function always sees a list of paths.
    input_file_paths: typing.List[Path]
    if isinstance(args.input_file_paths, (list, tuple)):
        input_file_paths = list(args.input_file_paths)
    else:
        input_file_paths = [args.input_file_paths]

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(
        args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        def show(text: str) -> None:
            # One flushed line per option on the diagnostic stream.
            print(text, file=error_file, flush=True)

        show("--input-files %s" % " ".join([str(path) for path in input_file_paths]))
        show("--output-file=%s" % str(args.output_file_path))
        # The optional file arguments are echoed only when they were supplied.
        if args.reject_file_path is not None:
            show("--reject-file=%s" % str(args.reject_file_path))
        if args.namespace_file_path is not None:
            show("--namespace-file=%s" % str(args.namespace_file_path))
        if args.updated_namespace_file_path is not None:
            show("--updated-namespace-file=%s" % str(args.updated_namespace_file_path))
        show("--namespace-id-prefix %s" % args.namespace_id_prefix)
        show("--namespace-id-use-uuid %s" % str(args.namespace_id_use_uuid))
        show("--namespace-id-counter %s" % str(args.namespace_id_counter))
        show("--namespace-id-zfill %s" % str(args.namespace_id_zfill))
        show("--output-only-used-namespaces %s" % str(args.output_only_used_namespaces))
        show("--allow-lax-uri %s" % str(args.allow_lax_uri))
        show("--local-namespace-prefix %s" % args.local_namespace_prefix)
        show("--local-namespace-use-uuid %s" % str(args.local_namespace_use_uuid))
        show("--prefix-expansion-label %s" % args.prefix_expansion_label)
        show("--structured-value-label %s" % args.structured_value_label)
        show("--structured-uri-label %s" % args.structured_uri_label)
        show("--newnode-prefix %s" % args.newnode_prefix)
        show("--newnode-use-uuid %s" % str(args.newnode_use_uuid))
        show("--newnode-counter %s" % str(args.newnode_counter))
        show("--newnode-zfill %s" % str(args.newnode_zfill))
        show("--build-id=%s" % str(args.build_id))
        show("--escape-pipes=%s" % str(args.escape_pipes))
        show("--validate=%s" % str(args.validate))
        if args.override_uuid is not None:
            show("--override_uuid=%s" % str(args.override_uuid))

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kn: KgtkNtriples = KgtkNtriples(
        input_file_paths=input_file_paths,
        output_file_path=args.output_file_path,
        reject_file_path=args.reject_file_path,
        namespace_file_path=args.namespace_file_path,
        updated_namespace_file_path=args.updated_namespace_file_path,
        namespace_id_prefix=args.namespace_id_prefix,
        namespace_id_use_uuid=args.namespace_id_use_uuid,
        namespace_id_counter=args.namespace_id_counter,
        namespace_id_zfill=args.namespace_id_zfill,
        output_only_used_namespaces=args.output_only_used_namespaces,
        newnode_prefix=args.newnode_prefix,
        newnode_use_uuid=args.newnode_use_uuid,
        newnode_counter=args.newnode_counter,
        newnode_zfill=args.newnode_zfill,
        allow_lax_uri=args.allow_lax_uri,
        local_namespace_prefix=args.local_namespace_prefix,
        local_namespace_use_uuid=args.local_namespace_use_uuid,
        prefix_expansion_label=args.prefix_expansion_label,
        structured_value_label=args.structured_value_label,
        structured_uri_label=args.structured_uri_label,
        build_id=args.build_id,
        escape_pipes=args.escape_pipes,
        idbuilder_options=idbuilder_options,
        validate=args.validate,
        override_uuid=args.override_uuid,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    kn.process()
Beispiel #6
0
def main():
    """
    Command-line driver for exercising the KGTK unique processor.

    Registers the unique-value counting options, parses the command line,
    optionally echoes the effective options to the error stream, then builds
    a Unique instance from the parsed values and runs its process() method.
    """
    cli: ArgumentParser = ArgumentParser()

    # The input file is an optional positional with no default, so it may be
    # None after parsing.
    cli.add_argument(dest="input_file_path", help="The KGTK file with the input data",
                     type=Path, nargs="?")

    cli.add_argument("--column", dest="column_name",
                     help="The column to count unique values (required).", required=True)

    cli.add_argument("--empty", dest="empty_value",
                     help="A value to substitute for empty values (default=%(default)s).",
                     default="")

    cli.add_argument("-o", "--output-file", dest="output_file_path",
                     help="The KGTK file to write (default=%(default)s).",
                     type=Path, default="-")

    cli.add_argument("--label", dest="label_value",
                     help="The output file label column value (default=%(default)s).",
                     default="count")

    # TODO: use an enum
    cli.add_argument("--format", dest="output_format",
                     help="The output file format and mode (default=%(default)s).",
                     default="edge", choices=["edge", "node"])

    cli.add_argument("--prefix", dest="prefix",
                     help="The value prefix (default=%(default)s).", default="")

    cli.add_argument("--where", dest="where_column_name",
                     help="The name of a column for a record selection test. (default=%(default)s).",
                     default=None)

    cli.add_argument("--in", dest="where_values", nargs="+",
                     help="The list of values for a record selection test. (default=%(default)s).",
                     default=None)

    # Shared option groups contributed by the other KGTK components.
    KgtkReader.add_debug_arguments(cli)
    KgtkReaderOptions.add_arguments(cli, mode_options=True)
    KgtkValueOptions.add_arguments(cli)

    args: Namespace = cli.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO
    if args.errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        def show(text: str) -> None:
            # One line per option on the diagnostic stream.
            print(text, file=error_file)

        # A missing input path is reported as "-".
        if args.input_file_path is None:
            shown_input = "-"
        else:
            shown_input = str(args.input_file_path)
        show("input: %s" % shown_input)
        show("--column=%s" % args.column_name)
        show("--empty=%s" % args.empty_value)
        show("--output-file=%s" % str(args.output_file_path))
        show("--label=%s" % args.label_value)
        show("--format=%s" % args.output_format)
        show("--prefix=%s" % args.prefix)
        if args.where_column_name is not None:
            show("--where=%s" % args.where_column_name)
        # Skip both "not given" (None) and an empty list.
        if args.where_values:
            show("--in=%s" % " ".join(args.where_values))

        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    counter: Unique = Unique(
        input_file_path=args.input_file_path,
        column_name=args.column_name,
        output_file_path=args.output_file_path,
        empty_value=args.empty_value,
        label_value=args.label_value,
        output_format=args.output_format,
        prefix=args.prefix,
        where_column_name=args.where_column_name,
        where_values=args.where_values,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    counter.process()
Beispiel #7
0
def main():
    """
    Command-line driver for exercising the KGTK file joiner.

    Edge files can be joined to edge files.
    Node files can be joined to node files.

    Registers the join options plus separate reader options for the left and
    right inputs, optionally echoes the prefixes and option structures to the
    error stream, then builds a KgtkJoiner and runs its process() method.

    TODO: Add more KgtkReader parameters, especially mode.
    """
    cli = ArgumentParser()

    # Keyword arguments shared by every optional-boolean switch below: a bare
    # flag means True, and the switch defaults to False.
    bool_switch = dict(type=optional_bool, nargs='?', const=True, default=False)

    cli.add_argument(dest="left_file_path", help="The left KGTK file to join", type=Path)
    cli.add_argument(dest="right_file_path", help="The right KGTK file to join", type=Path)
    cli.add_argument("-o", "--output-file",
                     dest="output_file_path",
                     help="The KGTK file to write",
                     type=Path,
                     default=None)

    cli.add_argument("--field-separator",
                     dest="field_separator",
                     help="Separator for multifield keys",
                     default=KgtkJoiner.FIELD_SEPARATOR_DEFAULT)

    cli.add_argument("--join-on-id",
                     dest="join_on_id",
                     help="If both input files are edge files, include the id column in the join (default=%(default)s).",
                     **bool_switch)

    cli.add_argument("--join-on-label",
                     dest="join_on_label",
                     help="If both input files are edge files, include the label column in the join (default=%(default)s).",
                     **bool_switch)

    cli.add_argument("--join-on-node2",
                     dest="join_on_node2",
                     help="If both input files are edge files, include the node2 column in the join (default=%(default)s).",
                     **bool_switch)

    # Left-side options.
    cli.add_argument("--left-prefix",
                     dest="left_prefix",
                     help="An optional prefix applied to left file column names in the output file (default=None).")
    cli.add_argument("--left-join",
                     dest="left_join",
                     help="Perform a left outer join (default=%(default)s).",
                     **bool_switch)
    cli.add_argument("--left-file-join-columns",
                     dest="left_join_columns",
                     help="Left file join columns.",
                     nargs='+')

    # Right-side options; --prefix is an alias for --right-prefix.
    cli.add_argument("--right-prefix", "--prefix",
                     dest="right_prefix",
                     help="An optional prefix applied to right file column names in the output file (default=None).")
    cli.add_argument("--right-file-join-columns",
                     dest="right_join_columns",
                     help="Right file join columns.",
                     nargs='+')
    cli.add_argument("--right-join",
                     dest="right_join",
                     help="Perform a right outer join (default=%(default)s).",
                     **bool_switch)

    # Reader options are registered once per side, distinguished by `who`.
    KgtkReader.add_debug_arguments(cli, expert=True)
    KgtkReaderOptions.add_arguments(cli, mode_options=True, who=KgtkJoiner.LEFT, expert=True)
    KgtkReaderOptions.add_arguments(cli, mode_options=True, who=KgtkJoiner.RIGHT, expert=True)
    KgtkValueOptions.add_arguments(cli, expert=True)

    args = cli.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO
    if args.errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    left_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who=KgtkJoiner.LEFT)
    right_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who=KgtkJoiner.RIGHT)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # Prefixes are echoed only when they were supplied.
        if args.left_prefix is not None:
            print("--left-prefix=%s" % args.left_prefix, file=error_file, flush=True)
        if args.right_prefix is not None:
            print("--right-prefix=%s" % args.right_prefix, file=error_file, flush=True)
        left_reader_options.show(out=error_file, who=KgtkJoiner.LEFT)
        right_reader_options.show(out=error_file, who=KgtkJoiner.RIGHT)
        value_options.show(out=error_file)

    joiner: KgtkJoiner = KgtkJoiner(
        left_file_path=args.left_file_path,
        right_file_path=args.right_file_path,
        output_path=args.output_file_path,
        left_join=args.left_join,
        right_join=args.right_join,
        join_on_id=args.join_on_id,
        join_on_label=args.join_on_label,
        join_on_node2=args.join_on_node2,
        left_join_columns=args.left_join_columns,
        right_join_columns=args.right_join_columns,
        left_prefix=args.left_prefix,
        right_prefix=args.right_prefix,
        field_separator=args.field_separator,
        left_reader_options=left_reader_options,
        right_reader_options=right_reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    joiner.process()
Beispiel #8
0
def main():
    """
    Test the KGTK ID builder.

    Parse the command line, optionally echo the effective options to the
    error stream, then open a KgtkReader on the input, create a KgtkIdBuilder
    from it, open a KgtkWriter with the builder's column names, and let the
    builder copy the input to the output while building IDs.  The reader and
    writer are closed even when processing raises.
    """
    parser: ArgumentParser = ArgumentParser()
    parser.add_argument(
        dest="input_file_path",
        help="The KGTK file with the input data (default=%(default)s)",
        type=Path,
        nargs="?",
        default="-")
    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")

    KgtkIdBuilderOptions.add_arguments(parser)

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(
        args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        print("input: %s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    # First create the KgtkReader.  It provides parameters used by the ID
    # column builder. Next, create the ID column builder, which provides a
    # possibly revised list of column names for the KgtkWriter.  Last, create
    # the KgtkWriter.

    # Open the input file.
    kr: KgtkReader = KgtkReader.open(
        args.input_file_path,
        error_file=error_file,
        options=reader_options,
        value_options=value_options,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )
    try:
        # Create the ID builder.
        idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(idb.column_names,
                                         args.output_file_path,
                                         mode=kr.mode,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         gzip_in_parallel=False,
                                         verbose=args.verbose,
                                         very_verbose=args.very_verbose)
        try:
            # Process the input file, building IDs.
            idb.process(kr, ew)
        finally:
            # Close the writer first, matching the original shutdown order.
            ew.close()
    finally:
        # Release the reader even if builder/writer creation or processing
        # failed.
        kr.close()
Beispiel #9
0
def main():
    """
    Test the KGTK explode processor.

    Parses the command line, optionally echoes the effective options to the
    error stream (when args.show_options is set), then builds a KgtkExplode
    from the parsed arguments and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument(
        dest="input_file_path",
        help="The KGTK file with the input data. (default=%(default)s)",
        type=Path,
        nargs="?",
        default="-")

    parser.add_argument(
        "--column",
        dest="column_name",
        help="The name of the column to explode. (default=%(default)s).",
        default="node2")

    # --types and --fields may not be given together on the command line.
    fgroup = parser.add_mutually_exclusive_group()

    fgroup.add_argument(
        "--types",
        dest="type_names",
        nargs='*',
        help=
        "The KGTK data types for which fields should be exploded. (default=%(default)s).",
        choices=KgtkFormat.DataType.choices(),
        default=KgtkFormat.DataType.choices())

    fgroup.add_argument(
        "--fields",
        dest="field_names",
        nargs='*',
        help=
        "The names of the fields to extract (overrides --types). (default=%(default)s).",
        choices=KgtkValueFields.FIELD_NAMES)

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--prefix",
        dest="prefix",
        help="The prefix for exploded column names. (default=%(default)s).",
        default="node2;kgtk:")

    parser.add_argument(
        "--overwrite",
        dest="overwrite_columns",
        help=
        "Indicate that it is OK to overwrite existing columns. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--expand",
        dest="expand_list",
        help=
        "Expand the source column if it contains a list, else fail. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        print("input: %s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        print("--column %s" % args.column_name, file=error_file, flush=True)
        print("--prefix %s" % args.prefix, file=error_file, flush=True)
        print("--overwrite %s" % str(args.overwrite_columns),
              file=error_file,
              flush=True)
        print("--expand %s" % str(args.expand_list),
              file=error_file,
              flush=True)
        if args.field_names is not None:
            print("--fields %s" % " ".join(args.field_names),
                  file=error_file,
                  flush=True)
        if args.type_names is not None:
            print("--types %s" % " ".join(args.type_names),
                  file=error_file,
                  flush=True)
        # Send this to the error stream like every other option echo; the
        # default data output ("-") would otherwise share stdout with it.
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    ex: KgtkExplode = KgtkExplode(input_file_path=args.input_file_path,
                                  column_name=args.column_name,
                                  prefix=args.prefix,
                                  field_names=args.field_names,
                                  type_names=args.type_names,
                                  overwrite_columns=args.overwrite_columns,
                                  expand_list=args.expand_list,
                                  output_file_path=args.output_file_path,
                                  reader_options=reader_options,
                                  value_options=value_options,
                                  error_file=error_file,
                                  verbose=args.verbose,
                                  very_verbose=args.very_verbose)

    ex.process()
Beispiel #10
0
def main():
    """
    Test the KGTK implode processor.

    Builds the command line parser, parses the arguments, optionally reports
    the effective settings on the error stream, and finally hands everything
    to a KgtkImplode instance and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    # Keyword arguments shared by every optional-boolean flag below.
    bool_flag: typing.Dict[str, typing.Any] = dict(type=optional_bool,
                                                   nargs='?',
                                                   const=True)

    parser.add_argument(dest="input_file_path", type=Path, nargs="?", default="-",
                        help="The KGTK file with the input data. (default=%(default)s)")

    parser.add_argument("--column", dest="column_name", default="node2",
                        help="The name of the column to explode. (default=%(default)s).")

    parser.add_argument("--types", dest="type_names", nargs='*',
                        choices=KgtkFormat.DataType.choices(),
                        default=KgtkFormat.DataType.choices(),
                        help="The KGTK data types for which fields should be imploded. (default=%(default)s).")

    parser.add_argument("--without", dest="without_fields", nargs='*',
                        choices=KgtkValueFields.OPTIONAL_DEFAULT_FIELD_NAMES,
                        default=None,
                        help="The KGTK fields to do without. (default=%(default)s).")

    parser.add_argument("-o", "--output-file", dest="output_file_path", type=Path, default="-",
                        help="The KGTK file to write (default=%(default)s).")

    parser.add_argument("--prefix", dest="prefix", default="node2;kgtk:",
                        help="The prefix for exploded column names. (default=%(default)s).")

    parser.add_argument("--overwrite", dest="overwrite_column", default=True,
                        help="Indicate that it is OK to overwrite an existing imploded column. (default=%(default)s).",
                        **bool_flag)

    parser.add_argument("--validate", dest="validate", default=True,
                        help="Validate imploded values. (default=%(default)s).",
                        **bool_flag)

    parser.add_argument("--escape-pipes", dest="escape_pipes", default=False,
                        help="When true, pipe characters (|) need to be escaped (\\|) per KGTK file format. (default=%(default)s).",
                        **bool_flag)

    parser.add_argument("--quantities-include-numbers", dest="quantities_include_numbers", default=True,
                        help="When true, numbers are acceptable quantities. (default=%(default)s).",
                        **bool_flag)

    parser.add_argument("--general-strings", dest="general_strings", default=True,
                        help="When true, strings may include language qualified strings. (default=%(default)s).",
                        **bool_flag)

    parser.add_argument("--remove-prefixed-columns", dest="remove_prefixed_columns", default=False,
                        help="When true, remove all columns beginning with the prefix from the output file. (default=%(default)s).",
                        **bool_flag)

    parser.add_argument("--ignore-unselected-types", dest="ignore_unselected_types", default=True,
                        help="When true, input records with valid but unselected data types will be passed through to output. (default=%(default)s).",
                        **bool_flag)

    parser.add_argument("--retain-unselected-types", dest="retain_unselected_types", default=True,
                        help="When true, input records with valid but unselected data types will be retain existing data on output. (default=%(default)s).",
                        **bool_flag)

    parser.add_argument("--build-id", dest="build_id", default=False,
                        help="Build id values in an id column. (default=%(default)s).",
                        **bool_flag)

    parser.add_argument("--reject-file", dest="reject_file_path", type=Path, default=None,
                        help="The KGTK file into which to write rejected records (default=%(default)s).")

    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    error_file: typing.TextIO = sys.stderr
    if args.errors_to_stdout:
        error_file = sys.stdout

    def show(text: str) -> None:
        # Emit one option line on the diagnostic stream, flushing at once.
        print(text, file=error_file, flush=True)

    # Assemble the option structures from the parsed arguments.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Report the effective settings for debugging and documentation.
    if args.show_options:
        show("input: %s" % str(args.input_file_path))
        show("--column %s" % args.column_name)
        show("--prefix %s" % args.prefix)
        show("--overwrite %s" % str(args.overwrite_column))
        show("--validate %s" % str(args.validate))
        show("--escape-pipes %s" % str(args.escape_pipes))
        show("--quantities-include-numbers %s" % str(args.quantities_include_numbers))
        show("--general-strings %s" % str(args.general_strings))
        show("--remove-prefixed-columns %s" % str(args.remove_prefixed_columns))
        show("--ignore-unselected-types %s" % str(args.ignore_unselected_types))
        show("--retain-unselected-types %s" % str(args.retain_unselected_types))
        if args.type_names is not None:
            show("--types %s" % " ".join(args.type_names))
        if args.without_fields is not None:
            show("--without %s" % " ".join(args.without_fields))
        show("--output-file=%s" % str(args.output_file_path))
        if args.reject_file_path is not None:
            show("--reject-file=%s" % str(args.reject_file_path))
        show("--build-id=%s" % str(args.build_id))
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    # An omitted --without is passed on as an empty list rather than None.
    omitted_fields: typing.List[str] = args.without_fields
    if omitted_fields is None:
        omitted_fields = []

    imploder: KgtkImplode = KgtkImplode(
        input_file_path=args.input_file_path,
        column_name=args.column_name,
        prefix=args.prefix,
        type_names=args.type_names,
        without_fields=omitted_fields,
        overwrite_column=args.overwrite_column,
        validate=args.validate,
        escape_pipes=args.escape_pipes,
        quantities_include_numbers=args.quantities_include_numbers,
        general_strings=args.general_strings,
        remove_prefixed_columns=args.remove_prefixed_columns,
        ignore_unselected_types=args.ignore_unselected_types,
        retain_unselected_types=args.retain_unselected_types,
        output_file_path=args.output_file_path,
        reject_file_path=args.reject_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    imploder.process()
Beispiel #11
0
def main():
    """
    Test the KGTK compact processor.

    Parses the command line, optionally reports the effective settings on
    the error stream, then constructs a KgtkCompact from the parsed
    arguments and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    # Keyword arguments shared by every optional-boolean flag below.
    bool_flag: typing.Dict[str, typing.Any] = dict(type=optional_bool,
                                                   nargs='?',
                                                   const=True)

    parser.add_argument(
        dest="input_file_path",
        type=Path,
        nargs="?",
        default="-",
        help="The KGTK file with the input data (default=%(default)s)",
    )

    parser.add_argument(
        "--columns",
        dest="key_column_names",
        nargs='+',
        default=[],
        help=("The key columns to identify records for compaction. "
              "(default=id for node files, (node1, label, node2, id) for edge files)."),
    )

    parser.add_argument(
        "--compact-id",
        dest="compact_id",
        default=False,
        help=("Indicate that the ID column in KGTK edge files should be compacted. "
              "Normally, if the ID column exists, it is not compacted, "
              "as there are use cases that need to maintain distinct lists of secondary edges for each ID value. (default=%(default)s)."),
        **bool_flag,
    )

    parser.add_argument(
        "--presorted",
        dest="sorted_input",
        default=False,
        help="Indicate that the input has been presorted (or at least pregrouped). (default=%(default)s).",
        **bool_flag,
    )

    parser.add_argument(
        "--verify-sort",
        dest="verify_sort",
        default=True,
        help="If the input has been presorted, verify its consistency (disable if only pregrouped). (default=%(default)s).",
        **bool_flag,
    )

    parser.add_argument(
        "-o",
        "--output-file",
        dest="output_file_path",
        type=Path,
        default="-",
        help="The KGTK file to write (default=%(default)s).",
    )

    parser.add_argument(
        "--build-id",
        dest="build_id",
        default=False,
        help="Build id values in an id column. (default=%(default)s).",
        **bool_flag,
    )

    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    error_file: typing.TextIO = sys.stderr
    if args.errors_to_stdout:
        error_file = sys.stdout

    def show(text: str) -> None:
        # Emit one option line on the diagnostic stream, flushing at once.
        print(text, file=error_file, flush=True)

    # Assemble the option structures from the parsed arguments.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Report the effective settings for debugging and documentation.
    if args.show_options:
        show("input: %s" % str(args.input_file_path))
        show("--columns %s" % " ".join(args.key_column_names))
        show("--compact-id=%s" % str(args.compact_id))
        show("--presorted=%s" % str(args.sorted_input))
        show("--verify-sort=%s" % str(args.verify_sort))
        show("--output-file=%s" % str(args.output_file_path))
        show("--build-id=%s" % str(args.build_id))
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    compactor: KgtkCompact = KgtkCompact(
        input_file_path=args.input_file_path,
        key_column_names=args.key_column_names,
        compact_id=args.compact_id,
        sorted_input=args.sorted_input,
        verify_sort=args.verify_sort,
        output_file_path=args.output_file_path,
        build_id=args.build_id,
        idbuilder_options=idbuilder_options,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    compactor.process()
Beispiel #12
0
def main():
    """
    Test the KGTK ifempty processor.

    Parses the command line, optionally echoes the effective options to the
    error stream (when args.show_options is set), then builds a KgtkIfEmpty
    from the parsed arguments and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument(dest="input_file_path",
                        help="The KGTK file with the input data",
                        type=Path,
                        nargs="?")

    parser.add_argument("--columns",
                        dest="filter_column_names",
                        help="The columns to filter on (default=None).",
                        nargs='+',
                        required=True)

    parser.add_argument(
        "--count",
        dest="only_count",
        help="Only count the records, do not copy them. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--all",
        dest="all_are",
        help=
        "False: Test if any are, True: test if all are (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--not-empty",
        dest="notempty",
        help=
        "False: test if empty, True: test if not empty (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # Echo the ifempty-specific options first, then the shared
        # reader/value option structures.
        print("input: %s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        # --columns is required with nargs='+', so this is a non-empty list.
        print("--columns %s" % " ".join(args.filter_column_names),
              file=error_file,
              flush=True)
        print("--count=%s" % str(args.only_count),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        print("--all=%s" % str(args.all_are), file=error_file, flush=True)
        print("--not-empty=%s" % str(args.notempty),
              file=error_file,
              flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    ie: KgtkIfEmpty = KgtkIfEmpty(input_file_path=args.input_file_path,
                                  filter_column_names=args.filter_column_names,
                                  output_file_path=args.output_file_path,
                                  all_are=args.all_are,
                                  notempty=args.notempty,
                                  only_count=args.only_count,
                                  reader_options=reader_options,
                                  value_options=value_options,
                                  error_file=error_file,
                                  verbose=args.verbose,
                                  very_verbose=args.very_verbose)

    ie.process()
Beispiel #13
0
def main():
    """
    Test the KGTK unreify-values processor.

    Parses the command line, optionally echoes the effective options to the
    error stream (when args.show_options is set), then builds a
    KgtkUnreifyValues from the parsed arguments and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument("-i",
                        "--input-file",
                        dest="input_file_path",
                        help="The KGTK input file. (default=%(default)s)",
                        type=Path,
                        default="-")

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK output file. (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--reified-file",
        dest="reified_file_path",
        help=
        "A KGTK output file that will contain only the reified values. (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--unreified-file",
        dest="unreified_file_path",
        help=
        "A KGTK output file that will contain only the unreified values. (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--uninvolved-file",
        dest="uninvolved_file_path",
        help=
        "A KGTK output file that will contain only the uninvolved input records. (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument("--output-format",
                        dest="output_format",
                        help="The file format (default=kgtk)",
                        type=str,
                        choices=KgtkWriter.OUTPUT_FORMAT_CHOICES)

    # NOTE(review): the trigger/value/old/new label options and the
    # allow-* flags read below are presumably registered by this call --
    # confirm against KgtkUnreifyValues.add_arguments.
    KgtkUnreifyValues.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=False, expert=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # There is a single input file (args.input_file_path); the previous
        # code referenced an undefined list of input paths here.
        print("--input-file=%s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        if args.reified_file_path is not None:
            print("--reified-file=%s" % str(args.reified_file_path),
                  file=error_file,
                  flush=True)
        if args.unreified_file_path is not None:
            print("--unreified-file=%s" % str(args.unreified_file_path),
                  file=error_file,
                  flush=True)
        if args.uninvolved_file_path is not None:
            print("--uninvolved-file=%s" % str(args.uninvolved_file_path),
                  file=error_file,
                  flush=True)

        if args.output_format is not None:
            print("--output-format=%s" % args.output_format,
                  file=error_file,
                  flush=True)

        if args.trigger_label_value is not None:
            print("--trigger-label=%s" % args.trigger_label_value,
                  file=error_file,
                  flush=True)
        if args.trigger_node2_value is not None:
            print("--trigger-node2=%s" % args.trigger_node2_value,
                  file=error_file,
                  flush=True)
        if args.value_label_value is not None:
            print("--value-label=%s" % args.value_label_value,
                  file=error_file,
                  flush=True)
        if args.old_label_value is not None:
            print("--old-label=%s" % args.old_label_value,
                  file=error_file,
                  flush=True)
        if args.new_label_value is not None:
            print("--new-label=%s" % args.new_label_value,
                  file=error_file,
                  flush=True)

        print("--allow-multiple-values=%s" % str(args.allow_multiple_values),
              file=error_file,
              flush=True)
        print("--allow-extra-columns=%s" % str(args.allow_extra_columns),
              file=error_file,
              flush=True)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kuv: KgtkUnreifyValues = KgtkUnreifyValues(
        input_file_path=args.input_file_path,
        output_file_path=args.output_file_path,
        reified_file_path=args.reified_file_path,
        unreified_file_path=args.unreified_file_path,
        uninvolved_file_path=args.uninvolved_file_path,
        trigger_label_value=args.trigger_label_value,
        trigger_node2_value=args.trigger_node2_value,
        value_label_value=args.value_label_value,
        old_label_value=args.old_label_value,
        new_label_value=args.new_label_value,
        allow_multiple_values=args.allow_multiple_values,
        allow_extra_columns=args.allow_extra_columns,
        reader_options=reader_options,
        value_options=value_options,
        output_format=args.output_format,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    kuv.process()
Beispiel #14
0
def main():
    """
    Test the KGTK lift processor.

    Parses the command line, optionally echoes the effective options to the
    error stream (when args.show_options is set), then builds a KgtkLift
    from the parsed arguments and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    # nargs="?" makes the positional optional so that the declared default
    # ("-") actually takes effect; argparse ignores `default` on a required
    # positional.
    parser.add_argument(dest="input_file_path",
                        help="The KGTK file with the input data",
                        type=Path,
                        nargs="?",
                        default="-")

    parser.add_argument(
        "--label-file",
        dest="label_file_path",
        help="A KGTK file with label records (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--node1-name",
        dest="node1_column_name",
        help="The name of the node1 column. (default=node1 or alias).",
        default=None)

    parser.add_argument("--label-name",
                        dest="label_column_name",
                        help="The name of the label column. (default=label).",
                        default=None)

    parser.add_argument(
        "--node2-name",
        dest="node2_column_name",
        help="The name of the node2 column. (default=node2 or alias).",
        default=None)

    parser.add_argument(
        "--label-value",
        dest="label_column_value",
        help="The value in the label column. (default=%(default)s).",
        default="label")
    parser.add_argument(
        "--lift-suffix",
        dest="lifted_column_suffix",
        help=
        "The suffix used for newly created columns. (default=%(default)s).",
        default=";label")

    parser.add_argument(
        "--columns-to-lift",
        dest="lift_column_names",
        help="The columns to lift. (default=[node1, label, node2]).",
        nargs='*')

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--remove-label-records",
        dest="remove_label_records",
        help=
        "If true, remove label records from the output. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True)

    parser.add_argument(
        "--sort-lifted-labels",
        dest="sort_lifted_labels",
        help="If true, sort lifted labels with lists. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True)

    parser.add_argument(
        "--suppress-duplicate-labels",
        dest="suppress_duplicate_labels",
        help=
        "If true, suppress duplicate values in lifted labels with lists (implies sorting). (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True)

    parser.add_argument(
        "--suppress-empty-columns",
        dest="suppress_empty_columns",
        help=
        "If true, do not create new columns that would be empty. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--ok-if-no-labels",
        dest="ok_if_no_labels",
        help=
        "If true, do not abort if no labels were found. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--input-file-is-presorted",
        dest="input_is_presorted",
        help=
        "If true, the input file is presorted on the column for which values are to be lifted. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--label-file-is-presorted",
        dest="labels_are_presorted",
        help=
        "If true, the label file is presorted on the node1 column. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    KgtkReader.add_debug_arguments(parser)
    # TODO: separate reader options for the label file.
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    # Every line goes to error_file so that the echo never mixes with data
    # written to stdout (the default output file is "-").
    if args.show_options:
        print("input: %s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        if args.label_file_path is not None:
            print("--label-file=%s" % str(args.label_file_path),
                  file=error_file,
                  flush=True)
        if args.node1_column_name is not None:
            print("--node1-name=%s" % args.node1_column_name,
                  file=error_file,
                  flush=True)
        if args.label_column_name is not None:
            print("--label-name=%s" % args.label_column_name,
                  file=error_file,
                  flush=True)
        if args.node2_column_name is not None:
            print("--node2-name=%s" % args.node2_column_name,
                  file=error_file,
                  flush=True)
        print("--label-value=%s" % args.label_column_value,
              file=error_file,
              flush=True)
        print("--lift-suffix=%s" % args.lifted_column_suffix,
              file=error_file,
              flush=True)
        if args.lift_column_names is not None and len(
                args.lift_column_names) > 0:
            print("--columns-to-lift %s" % " ".join(args.lift_column_names),
                  file=error_file,
                  flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        print("--remove-label-records=%s" % str(args.remove_label_records),
              file=error_file,
              flush=True)
        print("--sort-lifted-labels=%s" % str(args.sort_lifted_labels),
              file=error_file,
              flush=True)
        print("--suppress-duplicate-labels=%s" %
              str(args.suppress_duplicate_labels),
              file=error_file,
              flush=True)
        print("--suppress-empty-columns=%s" % str(args.suppress_empty_columns),
              file=error_file,
              flush=True)
        print("--ok-if-no-labels=%s" % str(args.ok_if_no_labels),
              file=error_file,
              flush=True)
        print("--input-file-is-presorted=%s" % str(args.input_is_presorted),
              file=error_file,
              flush=True)
        print("--label-file-is-presorted=%s" % str(args.labels_are_presorted),
              file=error_file,
              flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kl: KgtkLift = KgtkLift(
        input_file_path=args.input_file_path,
        label_file_path=args.label_file_path,
        node1_column_name=args.node1_column_name,
        label_column_name=args.label_column_name,
        node2_column_name=args.node2_column_name,
        label_column_value=args.label_column_value,
        lifted_column_suffix=args.lifted_column_suffix,
        lift_column_names=args.lift_column_names,
        output_file_path=args.output_file_path,
        remove_label_records=args.remove_label_records,
        sort_lifted_labels=args.sort_lifted_labels,
        suppress_duplicate_labels=args.suppress_duplicate_labels,
        suppress_empty_columns=args.suppress_empty_columns,
        ok_if_no_labels=args.ok_if_no_labels,
        input_is_presorted=args.input_is_presorted,
        labels_are_presorted=args.labels_are_presorted,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    kl.process()
Beispiel #15
0
def main():
    """
    Test the KGTK file concatenator.

    Parses the command line, optionally echoes the effective options to the
    error file, then builds a KgtkCat from the parsed arguments and runs it.
    """
    parser = ArgumentParser()
    parser.add_argument(dest="input_file_paths",
                        help="The KGTK files to concatenate",
                        type=Path,
                        nargs='+')
    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s)",
                        type=Path,
                        default="-")

    parser.add_argument("--output-format",
                        dest="output_format",
                        help="The file format (default=kgtk)",
                        type=str,
                        choices=KgtkWriter.OUTPUT_FORMAT_CHOICES)

    parser.add_argument(
        "--output-columns",
        dest="output_column_names",
        help="Rename all output columns. (default=%(default)s)",
        type=str,
        nargs='+')
    parser.add_argument(
        "--old-columns",
        dest="old_column_names",
        help="Rename selected output columns: old names. (default=%(default)s)",
        type=str,
        nargs='+')
    parser.add_argument(
        "--new-columns",
        dest="new_column_names",
        help="Rename selected output columns: new names. (default=%(default)s)",
        type=str,
        nargs='+')

    KgtkReader.add_debug_arguments(parser, expert=True)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(parser, expert=True)

    args = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        input_files: typing.List[str] = [
            str(input_file) for input_file in args.input_file_paths
        ]
        print("input: %s" % " ".join(input_files), file=error_file, flush=True)
        print("--output-file=%s" % args.output_file_path,
              file=error_file,
              flush=True)
        # The remaining options have no default, so they are echoed only when given.
        if args.output_format is not None:
            print("--output-format=%s" % args.output_format,
                  file=error_file,
                  flush=True)
        if args.output_column_names is not None:
            print("--output-columns=%s" % " ".join(args.output_column_names),
                  file=error_file,
                  flush=True)
        if args.old_column_names is not None:
            print("--old-columns=%s" % " ".join(args.old_column_names),
                  file=error_file,
                  flush=True)
        if args.new_column_names is not None:
            print("--new-columns=%s" % " ".join(args.new_column_names),
                  file=error_file,
                  flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kc: KgtkCat = KgtkCat(input_file_paths=args.input_file_paths,
                          output_path=args.output_file_path,
                          output_format=args.output_format,
                          output_column_names=args.output_column_names,
                          old_column_names=args.old_column_names,
                          new_column_names=args.new_column_names,
                          reader_options=reader_options,
                          value_options=value_options,
                          error_file=error_file,
                          verbose=args.verbose,
                          very_verbose=args.very_verbose)

    kc.process()
Beispiel #16
0
def main():
    """
    Test driver for KgtkSortBufferTest.

    Parses the command line, optionally echoes the effective options to the
    error file, then builds a KgtkSortBufferTest from the parsed arguments
    and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument("-i",
                        "--input-file",
                        dest="input_file_path",
                        help="The KGTK input file. (default=%(default)s)",
                        type=Path,
                        default="-")

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK output file. (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--keygen",
        dest="keygen",
        help="The KGTK key generation procedure. (default=%(default)s).",
        type=str,
        default="node1")

    parser.add_argument(
        "--group-sort",
        dest="group_sort",
        help="If true, use the grouped sort and buffer. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--group-iterate",
        dest="group_iterate",
        help="If true, use the grouped iteration. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True, expert=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        # This tool takes a single input file (args.input_file_path); there
        # is no list of input paths to report.
        print("--input-file=%s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)
        print("--keygen=%s" % str(args.keygen), file=error_file, flush=True)
        print("--group-sort=%s" % str(args.group_sort),
              file=error_file,
              flush=True)
        print("--group-iterate=%s" % str(args.group_iterate),
              file=error_file,
              flush=True)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    ksbt: KgtkSortBufferTest = KgtkSortBufferTest(
        input_file_path=args.input_file_path,
        output_file_path=args.output_file_path,
        keygen=args.keygen,
        group_sort=args.group_sort,
        group_iterate=args.group_iterate,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    ksbt.process()
Beispiel #17
0
def main():
    """
    Command-line driver for KgtkIfExists.

    Collects the input, filter and output paths plus the matching options
    from the command line, optionally reports them on the error file, and
    then runs the KgtkIfExists processor.
    """
    cli: ArgumentParser = ArgumentParser()

    cli.add_argument(
        dest="input_file_path",
        help="The KGTK file with the input data",
        type=Path,
        nargs="?",
    )
    cli.add_argument(
        "--filter-on",
        dest="filter_file_path",
        help="The KGTK file with the filter data (required).",
        type=Path,
        required=True,
    )
    cli.add_argument(
        "-o",
        "--output-file",
        dest="output_file_path",
        help="The KGTK file to write (default=%(default)s).",
        type=Path,
        default="-",
    )
    cli.add_argument(
        "--field-separator",
        dest="field_separator",
        help="Separator for multifield keys (default=%(default)s)",
        default=KgtkIfExists.FIELD_SEPARATOR_DEFAULT,
    )

    # The boolean switches share one shape: an optional value, True when
    # given bare, False when absent.
    bool_switches = (
        ("--invert", "invert",
         "Invert the test (if not exists) (default=%(default)s)."),
        ("--cache-input", "cache_input",
         "Cache the input file instead of the filter keys. (default=%(default)s)."),
        ("--preserve-order", "preserve_order",
         "Preserve record order when cacheing the input file. (default=%(default)s)."),
    )
    for flag, dest_name, help_text in bool_switches:
        cli.add_argument(flag,
                         dest=dest_name,
                         help=help_text,
                         type=optional_bool,
                         nargs='?',
                         const=True,
                         default=False)

    # Key column lists for each side of the comparison.
    key_lists = (
        ("--input-keys", "input_keys",
         "The key columns in the input file (default=None)."),
        ("--filter-keys", "filter_keys",
         "The key columns in the filter file (default=None)."),
    )
    for flag, dest_name, help_text in key_lists:
        cli.add_argument(flag, dest=dest_name, help=help_text, nargs='*')

    KgtkReader.add_debug_arguments(cli)
    KgtkReaderOptions.add_arguments(cli, mode_options=True, who="input")
    KgtkReaderOptions.add_arguments(cli, mode_options=True, who="filter")
    KgtkValueOptions.add_arguments(cli)

    args: Namespace = cli.parse_args()

    err_out: typing.TextIO
    if args.errors_to_stdout:
        err_out = sys.stdout
    else:
        err_out = sys.stderr

    # Build the option structures (one reader configuration per file role).
    input_opts: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who="input")
    filter_opts: KgtkReaderOptions = KgtkReaderOptions.from_args(args, who="filter")
    value_opts: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        if args.input_file_path is None:
            shown_input = "-"
        else:
            shown_input = str(args.input_file_path)

        report: typing.List[str] = [
            "input: %s" % shown_input,
            "--filter-on=%s" % str(args.filter_file_path),
            "--output-file=%s" % str(args.output_file_path),
            "--field-separator=%s" % repr(args.field_separator),
            "--invert=%s" % str(args.invert),
            "--cache-input=%s" % str(args.cache_input),
            "--preserve-order=%s" % str(args.preserve_order),
        ]
        if args.input_keys is not None:
            report.append("--input-keys %s" % " ".join(args.input_keys))
        if args.filter_keys is not None:
            report.append("--filter-keys %s" % " ".join(args.filter_keys))
        for report_line in report:
            print(report_line, file=err_out)

        input_opts.show(out=err_out, who="input")
        filter_opts.show(out=err_out, who="filter")
        value_opts.show(out=err_out)

    processor: KgtkIfExists = KgtkIfExists(
        input_file_path=args.input_file_path,
        input_keys=args.input_keys,
        filter_file_path=args.filter_file_path,
        filter_keys=args.filter_keys,
        output_file_path=args.output_file_path,
        field_separator=args.field_separator,
        invert=args.invert,
        cache_input=args.cache_input,
        preserve_order=args.preserve_order,
        input_reader_options=input_opts,
        filter_reader_options=filter_opts,
        value_options=value_opts,
        error_file=err_out,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )
    processor.process()
Beispiel #18
0
def main():
    """
    Test the KGTK lift processor.

    Parses the command line, optionally echoes the effective options to the
    error file, then builds a KgtkLift from the parsed arguments and runs it.
    """
    parser: ArgumentParser = ArgumentParser()

    # nargs="?" makes the positional optional so that the "-" default can
    # actually take effect; without it argparse requires the argument and
    # ignores the default.
    parser.add_argument(dest="input_file_path",
                        help="The KGTK file with the input data",
                        type=Path,
                        nargs="?",
                        default="-")

    parser.add_argument("-o",
                        "--output-file",
                        dest="output_file_path",
                        help="The KGTK file to write (default=%(default)s).",
                        type=Path,
                        default="-")

    parser.add_argument(
        "--label-file",
        dest="label_file_path",
        help="An optional KGTK file with label records (default=%(default)s).",
        type=Path,
        default=None)

    parser.add_argument(
        "--input-select-column",
        "--input-label-column",
        dest="input_select_column_name",
        help="If input record selection is enabled by --input-select-value, " +
        "the name of a column that determines which records received lifted values. "
        + "The default is the 'label' column or its alias.",
        default=None)

    parser.add_argument(
        "--input-select-value",
        "--input-label-value",
        "--target-label-value",
        dest="input_select_column_value",
        help=
        "The value in the input select column that identifies a record to receive lifted values. "
        + "The default is not to perform input record selection, " +
        "and all input records except label records may receive lifted values. ",
        default=None)

    parser.add_argument(
        "--columns-to-lift",
        dest="input_lifting_column_names",
        help="The columns for which matching labels are to be lifted. " +
        "The default is [node1, label, node2] or their aliases.",
        nargs='*')

    parser.add_argument(
        "--lift-suffix",
        dest="output_lifted_column_suffix",
        help=
        "The suffix used for newly created output columns. (default=%(default)s).",
        default=KgtkLift.DEFAULT_OUTPUT_LIFTED_COLUMN_SUFFIX)

    parser.add_argument(
        "--update-select-value",
        "--target-new-label-value",
        dest="output_select_column_value",
        help=
        "A new value for the select (label) column for records that received lifted values. "
        + "The default is not to update the select(label) column.",
        default=None)

    parser.add_argument(
        "--label-select-column",
        "--label-name",
        dest="label_select_column_name",
        help=
        "The name of the column that contains a special value that identifies label records. "
        + "The default is 'label' or its alias.",
        default=None)

    parser.add_argument(
        "-p",
        "--label-select-value",
        "--label-value",
        "--property",
        dest="label_select_column_value",
        help=
        "The special value in the label select column that identifies a label record. "
        + "(default=%(default)s).",
        default=KgtkLift.DEFAULT_LABEL_SELECT_COLUMN_VALUE)

    parser.add_argument(
        "--label-match-column",
        "--node1-name",
        dest="label_match_column_name",
        help=
        "The name of the column in the label records that contains the value "
        +
        "that matches the value in a column being lifted in the input records. "
        + "The default is 'node1' or its alias.",
        default=None)

    parser.add_argument(
        "--label-value-column",
        "--node2-name",
        "--lift-from",
        dest="label_value_column_name",
        help=
        "The name of the column in the label record that contains the value " +
        "to be lifted into the input record that is receiving lifted values. "
        + "The default is 'node2' or its alias.",
        default=None)

    parser.add_argument(
        "--remove-label-records",
        dest="remove_label_records",
        help=
        "If true, remove label records from the output. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True)

    parser.add_argument(
        "--sort-lifted-labels",
        dest="sort_lifted_labels",
        help="If true, sort lifted labels with lists. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True)

    parser.add_argument(
        "--suppress-duplicate-labels",
        dest="suppress_duplicate_labels",
        help=
        "If true, suppress duplicate values in lifted labels with lists (implies sorting). (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=True)

    parser.add_argument(
        "--suppress-empty-columns",
        dest="suppress_empty_columns",
        help=
        "If true, do not create new columns that would be empty. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--ok-if-no-labels",
        dest="ok_if_no_labels",
        help=
        "If true, do not abort if no labels were found. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--prefilter-labels",
        dest="prefilter_labels",
        help=
        "If true, read the input file before reading the label file. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--input-file-is-presorted",
        dest="input_is_presorted",
        help=
        "If true, the input file is presorted on the column for which values are to be lifted. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    parser.add_argument(
        "--label-file-is-presorted",
        dest="labels_are_presorted",
        help=
        "If true, the label file is presorted on the node1 column. (default=%(default)s).",
        type=optional_bool,
        nargs='?',
        const=True,
        default=False)

    KgtkReader.add_debug_arguments(parser)
    # TODO: separate reader options for the label file.
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Diagnostics go to stdout only when explicitly requested.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    # Every line goes to error_file so that the report never mixes with
    # data written to standard output.
    if args.show_options:
        print("input: %s" % str(args.input_file_path),
              file=error_file,
              flush=True)
        if args.label_file_path is not None:
            print("--label-file=%s" % str(args.label_file_path),
                  file=error_file,
                  flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file,
              flush=True)

        if args.input_select_column_name is not None:
            print("--input-select-column=%s" % args.input_select_column_name,
                  file=error_file,
                  flush=True)
        if args.input_select_column_value is not None:
            print("--input-select-value=%s" % args.input_select_column_value,
                  file=error_file,
                  flush=True)
        if args.input_lifting_column_names is not None and len(
                args.input_lifting_column_names) > 0:
            print("--columns-to-lift %s" %
                  " ".join(args.input_lifting_column_names),
                  file=error_file,
                  flush=True)

        print("--lift-suffix=%s" % args.output_lifted_column_suffix,
              file=error_file,
              flush=True)
        if args.output_select_column_value is not None:
            print("--update-select-value=%s" % args.output_select_column_value,
                  file=error_file,
                  flush=True)

        if args.label_select_column_name is not None:
            print("--label-select-column=%s" % args.label_select_column_name,
                  file=error_file,
                  flush=True)
        if args.label_select_column_value is not None:
            print("--label-select-value=%s" % args.label_select_column_value,
                  file=error_file,
                  flush=True)
        if args.label_match_column_name is not None:
            print("--label-match-column=%s" % args.label_match_column_name,
                  file=error_file,
                  flush=True)
        if args.label_value_column_name is not None:
            print("--label-value-column=%s" % args.label_value_column_name,
                  file=error_file,
                  flush=True)

        print("--remove-label-records=%s" % str(args.remove_label_records),
              file=error_file,
              flush=True)
        print("--sort-lifted-labels=%s" % str(args.sort_lifted_labels),
              file=error_file,
              flush=True)
        print("--suppress-duplicate-labels=%s" %
              str(args.suppress_duplicate_labels),
              file=error_file,
              flush=True)
        print("--suppress-empty-columns=%s" % str(args.suppress_empty_columns),
              file=error_file,
              flush=True)
        print("--ok-if-no-labels=%s" % str(args.ok_if_no_labels),
              file=error_file,
              flush=True)
        print("--prefilter-labels=%s" % str(args.prefilter_labels),
              file=error_file,
              flush=True)
        print("--input-file-is-presorted=%s" % str(args.input_is_presorted),
              file=error_file,
              flush=True)
        print("--label-file-is-presorted=%s" % str(args.labels_are_presorted),
              file=error_file,
              flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)

    kl: KgtkLift = KgtkLift(
        input_file_path=args.input_file_path,
        label_file_path=args.label_file_path,
        output_file_path=args.output_file_path,
        input_select_column_name=args.input_select_column_name,
        input_select_column_value=args.input_select_column_value,
        input_lifting_column_names=args.input_lifting_column_names,
        output_select_column_value=args.output_select_column_value,
        output_lifted_column_suffix=args.output_lifted_column_suffix,
        label_select_column_name=args.label_select_column_name,
        label_select_column_value=args.label_select_column_value,
        label_match_column_name=args.label_match_column_name,
        label_value_column_name=args.label_value_column_name,
        remove_label_records=args.remove_label_records,
        sort_lifted_labels=args.sort_lifted_labels,
        suppress_duplicate_labels=args.suppress_duplicate_labels,
        suppress_empty_columns=args.suppress_empty_columns,
        ok_if_no_labels=args.ok_if_no_labels,
        prefilter_labels=args.prefilter_labels,
        input_is_presorted=args.input_is_presorted,
        labels_are_presorted=args.labels_are_presorted,
        reader_options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=args.verbose,
        very_verbose=args.very_verbose)

    kl.process()