def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    filter_column_names: typing.List[str],
    all_are: bool = False,
    only_count: bool = False,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Drive a KgtkIfEmpty pass (with ``notempty=False``) over one file.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        output_file: Output file argument, resolved by KGTKArgumentParser.
        filter_column_names: Column names forwarded to KgtkIfEmpty.
        all_are: Forwarded to KgtkIfEmpty unchanged.
        only_count: Forwarded to KgtkIfEmpty unchanged.
        errors_to_stdout: When true, diagnostics go to stdout, not stderr.
        errors_to_stderr: Accepted but not consulted by this function.
        show_options: When true, echo the effective options first.
        verbose: Forwarded to KgtkIfEmpty unchanged.
        very_verbose: Forwarded to KgtkIfEmpty unchanged.
        **kwargs: Source for the reader and value option structures.

    Returns:
        0 once processing has completed.

    Raises:
        KGTKException: If processing raises any Exception or SystemExit.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    in_path: Path = KGTKArgumentParser.get_input_file(input_file)
    out_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics default to stderr; stdout is used only on request.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Both option structures are populated from the same keyword dict.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective options for debugging and documentation.
        def emit(text: str, flush: bool = False) -> None:
            print(text, file=error_file, flush=flush)

        emit("--input-file=%s" % str(in_path))
        emit("--output-file=%s" % str(out_path))
        emit("--columns=%s" % " ".join(filter_column_names))
        emit("--count=%s" % str(only_count))
        emit("--all=%s" % str(all_are))
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        emit("=======", flush=True)

    try:
        KgtkIfEmpty(
            input_file_path=in_path,
            filter_column_names=filter_column_names,
            output_file_path=out_path,
            all_are=all_are,
            notempty=False,
            only_count=only_count,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    key_column_names: typing.List[str],
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Drive a KgtkExpand pass over one input file.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        output_file: Output file argument, resolved by KGTKArgumentParser.
        key_column_names: Key column names forwarded to KgtkExpand.
        errors_to_stdout: When true, diagnostics go to stdout, not stderr.
        errors_to_stderr: Accepted but not consulted by this function.
        show_options: When true, echo the effective options first.
        verbose: Forwarded to KgtkExpand unchanged.
        very_verbose: Forwarded to KgtkExpand unchanged.
        **kwargs: Source for the reader and value option structures.

    Returns:
        0 once processing has completed.

    Raises:
        KGTKException: If processing raises any Exception or SystemExit.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkexpand import KgtkExpand
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    in_path: Path = KGTKArgumentParser.get_input_file(input_file)
    out_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics default to stderr; stdout is used only on request.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Both option structures are populated from the same keyword dict.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective options for debugging and documentation.
        def emit(text: str, flush: bool = False) -> None:
            print(text, file=error_file, flush=flush)

        emit("--input-file=%s" % str(in_path))
        emit("--output-file=%s" % str(out_path))
        emit("--columns=%s" % " ".join(key_column_names))
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        emit("=======", flush=True)

    try:
        KgtkExpand(
            input_file_path=in_path,
            key_column_names=key_column_names,
            output_file_path=out_path,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    no_header: bool = False,
    properties: str = '',
    undirected: bool = False,
    strong: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Build a ConnectedComponents worker and run it.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        output_file: Output file argument, resolved by KGTKArgumentParser.
        no_header: Forwarded to ConnectedComponents unchanged.
        properties: Forwarded to ConnectedComponents unchanged.
        undirected: Forwarded to ConnectedComponents unchanged.
        strong: Forwarded to ConnectedComponents unchanged.
        **kwargs: Accepted but not consulted by this function.

    Returns:
        0 once processing has completed.

    Raises:
        KGTKException: If ``process()`` raises any Exception. Errors raised
            while constructing the worker are not wrapped.
    """
    from kgtk.gt.connected_components import ConnectedComponents
    from kgtk.exceptions import KGTKException

    in_path: Path = KGTKArgumentParser.get_input_file(input_file)
    out_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # The worker is constructed outside the try block, so only failures in
    # process() are converted to KGTKException.
    worker: ConnectedComponents = ConnectedComponents(
        input_file_path=in_path,
        output_file_path=out_path,
        no_header=no_header,
        properties=properties,
        undirected=undirected,
        strong=strong,
    )

    try:
        worker.process()
    except Exception as err:
        raise KGTKException(str(err))
    else:
        return 0
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    output_format: str,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Drive a KgtkCat pass over a single input file.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        output_file: Output file argument, resolved by KGTKArgumentParser.
        output_format: Forwarded to KgtkCat unchanged.
        errors_to_stdout: When true, diagnostics go to stdout, not stderr.
        errors_to_stderr: Accepted but not consulted by this function.
        show_options: When true, echo the effective options first.
        verbose: Forwarded to KgtkCat unchanged.
        very_verbose: Forwarded to KgtkCat unchanged.
        **kwargs: Source for the reader and value option structures.

    Returns:
        0 once processing has completed.

    Raises:
        KGTKException: If processing raises any Exception or SystemExit.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    in_path: Path = KGTKArgumentParser.get_input_file(input_file)
    out_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics default to stderr; stdout is used only on request.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # TODO: check that at most one input file is stdin?

    # Both option structures are populated from the same keyword dict.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective options for debugging and documentation.
        def emit(text: str) -> None:
            print(text, file=error_file, flush=True)

        emit("--input-file=%s" % str(in_path))
        emit("--output-file=%s" % str(out_path))
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        emit("=======")

    try:
        # KgtkCat takes a list of inputs; this command supplies exactly one.
        KgtkCat(
            input_file_paths=[in_path],
            output_path=out_path,
            output_format=output_format,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
def run(input_file, output_file, columns='1', reverse=False, space=False, speed=False,
        extra='', tsv=False, csv=False, _dt=None, naptime=1):
    """Run sort according to the provided command-line arguments.

    The input and output arguments are resolved through KGTKArgumentParser,
    a command pipeline is assembled with ``build_command`` and executed
    with ``zcat.run_sh_commands``.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        output_file: Output file argument; a resolved value of "-" is passed
            to ``build_command`` as None.
        columns: Forwarded to ``build_command`` unchanged.
        reverse: When true, ' -r' is appended to the sort options.
        space: When true, ``space_config`` is appended to the sort options.
        speed: When true (and ``space`` is false), ``speed_config`` is
            appended to the sort options.
        extra: Initial sort options string that the flags are appended to.
        tsv: When true, the column separator is always a tab.
        csv: When true (and ``tsv`` is false), the separator is a comma.
        _dt: The value 'csv' has the same effect as ``csv``.
        naptime: Seconds to sleep (after ``int()`` conversion) before
            doing any work.

    Returns:
        The ``exit_code`` of the executed pipeline, or None when the
        pipeline was interrupted by SIGPIPE.

    Raises:
        KGTKException: If building or running the pipeline raises any other
            Exception; the original exception is attached as the cause.
    """
    import time
    import kgtk.cli.zconcat as zcat

    time.sleep(int(naptime))

    # Named `source` rather than `input` so the builtin is not shadowed.
    source = str(KGTKArgumentParser.get_input_file(input_file))
    output = str(KGTKArgumentParser.get_output_file(output_file))
    if output == "-":
        output = None

    try:
        # An explicit tsv request wins over either csv indicator.
        colsep = ',' if not tsv and (csv or _dt == 'csv') else '\t'

        options = extra
        if reverse:
            options += ' -r'
        if space:
            options += ' ' + space_config
        elif speed:
            options += ' ' + speed_config

        pipe = build_command(input=source, output=output, columns=columns,
                             colsep=colsep, options=options)
        return zcat.run_sh_commands(pipe).exit_code
    except sh.SignalException_SIGPIPE:
        # hack to work around Python3 issue when stdout is gone when we try to report an exception;
        # without this we get an ugly 'Exception ignored...' msg when we quit with head or a pager:
        sys.stdout = os.fdopen(1)
        return None
    except Exception as e:
        raise KGTKException('INTERNAL ERROR: ' + type(e).__module__ + '.' + str(e) + '\n') from e
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    properties: str = '',
    undirected: bool = False,
    strong: bool = False,

    # These parameters are typed loosely so that importing graph_tool can
    # be postponed: ConnectedComponents cannot be referenced in the
    # signature.
    cluster_name_method: typing.Optional[typing.Any] = None,
    cluster_name_separator: typing.Optional[str] = None,
    cluster_name_prefix: typing.Optional[str] = None,
    cluster_name_zfill: typing.Optional[int] = None,
    minimum_cluster_size: typing.Optional[int] = None,

    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Build a ConnectedComponents worker with cluster naming and run it.

    Every cluster-naming parameter left as None is replaced by the matching
    ``ConnectedComponents.DEFAULT_*`` class attribute.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        output_file: Output file argument, resolved by KGTKArgumentParser.
        properties: Forwarded to ConnectedComponents unchanged.
        undirected: Forwarded to ConnectedComponents unchanged.
        strong: Forwarded to ConnectedComponents unchanged.
        cluster_name_method: Naming method, or None for the default.
        cluster_name_separator: Name separator, or None for the default.
        cluster_name_prefix: Name prefix, or None for the default.
        cluster_name_zfill: Zero-fill width, or None for the default.
        minimum_cluster_size: Minimum size, or None for the default.
        **kwargs: Accepted but not consulted by this function.

    Returns:
        0 once processing has completed.

    Raises:
        KGTKException: If ``process()`` raises any Exception. Errors raised
            while constructing the worker are not wrapped.
    """
    from pathlib import Path

    from kgtk.exceptions import KGTKException
    from kgtk.gt.connected_components import ConnectedComponents
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions

    in_path: Path = KGTKArgumentParser.get_input_file(input_file)
    out_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # ConnectedComponents has been imported by now, so its defaults can be
    # substituted for any parameter the caller left unset.
    if cluster_name_method is None:
        name_method = ConnectedComponents.DEFAULT_CLUSTER_NAME_METHOD
    else:
        name_method = cluster_name_method
    if cluster_name_separator is None:
        cluster_name_separator = ConnectedComponents.DEFAULT_CLUSTER_NAME_SEPARATOR
    if cluster_name_prefix is None:
        cluster_name_prefix = ConnectedComponents.DEFAULT_CLUSTER_NAME_PREFIX
    if cluster_name_zfill is None:
        cluster_name_zfill = ConnectedComponents.DEFAULT_CLUSTER_NAME_ZFILL
    if minimum_cluster_size is None:
        minimum_cluster_size = ConnectedComponents.DEFAULT_MINIMUM_CLUSTER_SIZE

    # The worker is constructed outside the try block, so only failures in
    # process() are converted to KGTKException.
    worker: ConnectedComponents = ConnectedComponents(
        input_file_path=in_path,
        output_file_path=out_path,
        properties=properties,
        undirected=undirected,
        strong=strong,
        cluster_name_method=name_method,
        cluster_name_separator=cluster_name_separator,
        cluster_name_prefix=cluster_name_prefix,
        cluster_name_zfill=cluster_name_zfill,
        minimum_cluster_size=minimum_cluster_size,
    )

    try:
        worker.process()
    except Exception as err:
        raise KGTKException(str(err))
    else:
        return 0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,
        unmodified_row_file: KGTKFiles,
        matched_label_file: KGTKFiles,
        unmatched_label_file: KGTKFiles,

        input_select_column_name: typing.Optional[str],
        input_select_column_value: typing.Optional[str],
        input_lifting_column_names: typing.List[str],

        output_lifted_column_names: typing.List[str],
        output_lifted_column_suffix: str,
        output_select_column_value: str,

        label_select_column_name: typing.Optional[str],
        label_select_column_value: str,
        label_match_column_name: typing.Optional[str],
        label_value_column_name: typing.Optional[str],

        default_value: str,

        remove_label_records: bool = False,
        sort_lifted_labels: bool = True,
        suppress_duplicate_labels: bool = True,
        suppress_empty_columns: bool = False,
        ok_if_no_labels: bool = False,
        prefilter_labels: bool = False,
        input_is_presorted: bool = False,
        labels_are_presorted: bool = False,

        clear_before_lift: bool = False,
        overwrite: bool = False,

        output_only_modified_rows: bool = False,

        languages: typing.Optional[typing.List[str]] = None,
        prioritize: bool = False,

        use_label_envar: bool = False,
        lift_all_columns: bool = False,
        require_label_file: bool = False,
        force_input_mode_none: bool = False,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Drive a KgtkLift pass over the input file.

    File arguments are resolved through KGTKArgumentParser; separate reader
    option structures are built for the "input" and "label" roles. When no
    label file was given and ``use_label_envar`` is set, the KGTK_LABEL_FILE
    environment variable is consulted. All remaining lift parameters are
    forwarded to KgtkLift unchanged.

    Args:
        input_file: Required input file argument.
        output_file: Required output file argument.
        label_file: Optional label input file argument.
        unmodified_row_file: Optional output file argument.
        matched_label_file: Optional output file argument.
        unmatched_label_file: Optional output file argument.
        use_label_envar: Allow KGTK_LABEL_FILE to supply the label file.
        require_label_file: Fail when no label file could be determined.
        errors_to_stdout: When true, diagnostics go to stdout, not stderr.
        errors_to_stderr: Accepted but not consulted by this function.
        show_options: When true, echo the effective options first.
        **kwargs: Source for the reader and value option structures.

    Returns:
        0 once processing has completed.

    Raises:
        KGTKException: If a label file is required but missing, or if
            processing raises any Exception or SystemExit.
    """
    # import modules locally
    import os
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.lift.kgtklift import KgtkLift
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    in_path: Path = KGTKArgumentParser.get_input_file(input_file)
    out_path: Path = KGTKArgumentParser.get_output_file(output_file)
    label_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_input_file(
        label_file, who="KGTK label file")
    unmodified_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        unmodified_row_file, who="KGTK unmodified row output file")
    matched_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        matched_label_file, who="KGTK matched label output file")
    unmatched_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        unmatched_label_file, who="KGTK unmatched label output file")

    # Diagnostics default to stderr; stdout is used only on request.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="input", fallback=True)
    label_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="label", fallback=True)
    # NOTE(review): this generic structure is built but not referenced again
    # in this function.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective options for debugging and documentation.
        def emit(text: str) -> None:
            print(text, file=error_file, flush=True)

        emit("--input-file=%s" % str(in_path))
        emit("--output-file=%s" % str(out_path))

        # Optional values are echoed only when they were supplied.
        for template, value in (
                ("--label-file=%s", label_path),
                ("--unmodified-row-output-file=%s", unmodified_path),
                ("--matched-label-output-file=%s", matched_path),
                ("--unmatched-label-output-file=%s", unmatched_path),
                ("--input-select-column=%s", input_select_column_name),
                ("--input-select-value=%s", input_select_column_value),
        ):
            if value is not None:
                emit(template % value)

        if input_lifting_column_names is not None and len(input_lifting_column_names) > 0:
            emit("--columns-to-lift %s" % " ".join(input_lifting_column_names))
        if output_lifted_column_names is not None and len(output_lifted_column_names) > 0:
            emit("--columns-to-write %s" % " ".join(output_lifted_column_names))
        emit("--lift-suffix=%s" % output_lifted_column_suffix)
        if output_select_column_value is not None:
            emit("--update-select-value=%s" % output_select_column_value)

        if label_select_column_name is not None:
            emit("--label-select-column=%s" % label_select_column_name)
        emit("--label-select-value=%s" % label_select_column_value)
        if label_match_column_name is not None:
            emit("--label-match-column=%s" % label_match_column_name)
        if label_value_column_name is not None:
            emit("--label-value-column=%s" % label_value_column_name)

        for template, setting in (
                ("--default-value=%s", default_value),
                ("--remove-label-records=%s", remove_label_records),
                ("--sort-lifted-labels=%s", sort_lifted_labels),
                ("--suppress-duplicate-labels=%s", suppress_duplicate_labels),
                ("--suppress-empty-columns=%s", suppress_empty_columns),
                ("--ok-if-no-labels=%s", ok_if_no_labels),
                ("--prefilter-labels=%s", prefilter_labels),
                ("--input-file-is-presorted=%s", input_is_presorted),
                ("--label-file-is-presorted=%s", labels_are_presorted),
                ("--clear-before-lift=%s", clear_before_lift),
                ("--overwrite=%s", overwrite),
                ("--output-only-modified-rows=%s", output_only_modified_rows),
        ):
            emit(template % repr(setting))

        if languages is not None:
            emit("--languages %s" % " ".join(repr(l) for l in languages))

        for template, setting in (
                ("--prioritize=%s", prioritize),
                ("--use-label-envar=%s", use_label_envar),
                ("--lift-all-columns=%s", lift_all_columns),
                ("--require-label-files=%s", require_label_file),
                ("--force-input-mode-none=%s", force_input_mode_none),
        ):
            emit(template % repr(setting))

        input_reader_options.show(out=error_file, who="input")
        label_reader_options.show(out=error_file, who="label")
        value_options.show(out=error_file)
        emit("=======")

    # Should the following functionality be moved to KgtkLift?
    if label_path is None and use_label_envar:
        envar_name: str = 'KGTK_LABEL_FILE'  # TODO: Move this to a common file.
        envar_value: typing.Optional[str] = os.getenv(envar_name)
        if envar_value is not None:
            label_path = Path(envar_value)
            if verbose:
                print("Using label file %s from envar %s" % (repr(envar_value), repr(envar_name)),
                      file=error_file, flush=True)

    if require_label_file and label_path is None:
        raise KGTKException("A label file must be specified using --label-file or KGTK_LABEL_FILE")

    try:
        KgtkLift(
            input_file_path=in_path,
            label_file_path=label_path,
            output_file_path=out_path,
            unmodified_row_file_path=unmodified_path,
            matched_label_file_path=matched_path,
            unmatched_label_file_path=unmatched_path,

            input_select_column_name=input_select_column_name,
            input_select_column_value=input_select_column_value,
            input_lifting_column_names=input_lifting_column_names,

            output_lifted_column_suffix=output_lifted_column_suffix,
            output_select_column_value=output_select_column_value,
            output_lifted_column_names=output_lifted_column_names,

            label_select_column_name=label_select_column_name,
            label_select_column_value=label_select_column_value,
            label_match_column_name=label_match_column_name,
            label_value_column_name=label_value_column_name,

            default_value=default_value,

            remove_label_records=remove_label_records,
            sort_lifted_labels=sort_lifted_labels,
            suppress_duplicate_labels=suppress_duplicate_labels,
            suppress_empty_columns=suppress_empty_columns,
            ok_if_no_labels=ok_if_no_labels,
            prefilter_labels=prefilter_labels,
            input_is_presorted=input_is_presorted,
            labels_are_presorted=labels_are_presorted,

            clear_before_lift=clear_before_lift,
            overwrite=overwrite,

            output_only_modified_rows=output_only_modified_rows,

            languages=languages,
            prioritize=prioritize,

            lift_all_columns=lift_all_columns,
            force_input_mode_none=force_input_mode_none,

            input_reader_options=input_reader_options,
            label_reader_options=label_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
def run(
    input_file: KGTKFiles,
    filter_file: KGTKFiles,
    output_file: KGTKFiles,
    reject_file: KGTKFiles,
    matched_filter_file: KGTKFiles,
    unmatched_filter_file: KGTKFiles,
    join_file: KGTKFiles,
    input_keys: typing.Optional[typing.List[str]],
    filter_keys: typing.Optional[typing.List[str]],
    cache_input: bool = False,
    preserve_order: bool = False,
    presorted: bool = False,
    field_separator: typing.Optional[str] = None,
    left_join: bool = False,
    right_join: bool = False,
    input_prefix: typing.Optional[str] = None,
    filter_prefix: typing.Optional[str] = None,
    join_output: bool = False,
    right_first: bool = False,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Drive a KgtkIfExists pass (with ``invert=False``).

    File arguments are resolved through KGTKArgumentParser; separate reader
    option structures are built for the "input" and "filter" roles. All
    remaining parameters are forwarded to KgtkIfExists unchanged.

    Args:
        input_file: Required input file argument.
        filter_file: Required filter file argument.
        output_file: Required output file argument.
        reject_file: Optional output file argument.
        matched_filter_file: Optional output file argument.
        unmatched_filter_file: Optional output file argument.
        join_file: Optional output file argument.
        field_separator: None selects KgtkIfExists.FIELD_SEPARATOR_DEFAULT.
        errors_to_stdout: When true, diagnostics go to stdout, not stderr.
        errors_to_stderr: Accepted but not consulted by this function.
        show_options: When true, echo the effective options first.
        **kwargs: Source for the reader and value option structures.

    Returns:
        0 once processing has completed.

    Raises:
        KGTKException: If both the input and the filter resolve to "-"
            (stdin), or if processing raises any Exception or SystemExit.
            Wrapped exceptions are attached as the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.iff.kgtkifexists import KgtkIfExists
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    filter_kgtk_file: Path = KGTKArgumentParser.get_input_file(
        filter_file, who="KGTK filter file")
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reject_file, who="KGTK reject file")
    matched_filter_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        matched_filter_file, who="KGTK matched filter file")
    unmatched_filter_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        unmatched_filter_file, who="KGTK unmatched filter file")
    join_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        join_file, who="KGTK join file")

    # Standard input can feed only one of the two input streams.
    if str(input_kgtk_file) == "-" and str(filter_kgtk_file) == "-":
        raise KGTKException(
            "May not use stdin for both --input-file and --filter-on files.")

    if field_separator is None:
        field_separator = KgtkIfExists.FIELD_SEPARATOR_DEFAULT

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="input", fallback=True)
    filter_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="filter", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--filter-file=%s" % str(filter_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        if matched_filter_kgtk_file is not None:
            print("--matched-filter-file=%s" % str(matched_filter_kgtk_file), file=error_file)
        if unmatched_filter_kgtk_file is not None:
            print("--unmatched-filter-file=%s" % str(unmatched_filter_kgtk_file), file=error_file)
        if join_kgtk_file is not None:
            print("--join-file=%s" % str(join_kgtk_file), file=error_file)
        if input_keys is not None:
            print("--input-keys=%s" % " ".join(input_keys), file=error_file)
        if filter_keys is not None:
            print("--filter-keys=%s" % " ".join(filter_keys), file=error_file)
        print("--cache-input=%s" % str(cache_input), file=error_file)
        print("--preserve-order=%s" % str(preserve_order), file=error_file)
        # Fixed: the echoed option name previously carried a stray "r".
        print("--presorted=%s" % str(presorted), file=error_file)
        print("--field-separator=%s" % repr(field_separator), file=error_file)
        print("--left-join=%s" % str(left_join), file=error_file)
        print("--right-join=%s" % str(right_join), file=error_file)
        if input_prefix is not None:
            print("--input-prefix=%s" % repr(input_prefix), file=error_file)
        if filter_prefix is not None:
            print("--filter-prefix=%s" % repr(filter_prefix), file=error_file)
        print("--join-output=%s" % str(join_output), file=error_file)
        print("--right-join-first=%s" % str(right_first), file=error_file)
        input_reader_options.show(out=error_file, who="input")
        filter_reader_options.show(out=error_file, who="filter")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        ie: KgtkIfExists = KgtkIfExists(
            input_file_path=input_kgtk_file,
            input_keys=input_keys,
            filter_file_path=filter_kgtk_file,
            filter_keys=filter_keys,
            output_file_path=output_kgtk_file,
            reject_file_path=reject_kgtk_file,
            matched_filter_file_path=matched_filter_kgtk_file,
            unmatched_filter_file_path=unmatched_filter_kgtk_file,
            join_file_path=join_kgtk_file,
            left_join=left_join,
            right_join=right_join,
            input_prefix=input_prefix,
            filter_prefix=filter_prefix,
            join_output=join_output,
            right_first=right_first,
            invert=False,
            cache_input=cache_input,
            preserve_order=preserve_order,
            presorted=presorted,
            field_separator=field_separator,
            input_reader_options=input_reader_options,
            filter_reader_options=filter_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        ie.process()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise KGTKException(str(e)) from e
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy the input file to the output file through a KgtkIdBuilder.

    A KgtkReader is opened on the input, a KgtkIdBuilder is created from it,
    and a KgtkWriter is opened using the builder's column names and the
    reader's mode. The builder then processes the stream. The reader and
    the writer are closed even when a later step fails.

    Args:
        input_file: Input file argument, resolved by KGTKArgumentParser.
        output_file: Output file argument, resolved by KGTKArgumentParser.
        errors_to_stdout: When true, diagnostics go to stdout, not stderr.
        errors_to_stderr: Accepted but not consulted by this function.
        show_options: When true, echo the effective options first.
        verbose: Forwarded to the reader and the writer.
        very_verbose: Forwarded to the reader and the writer.
        **kwargs: Source for the ID builder, reader and value options.

    Returns:
        0 once processing has completed.

    Raises:
        KGTKException: If opening, processing or closing raises any
            Exception or SystemExit; the original is attached as the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        # First create the KgtkReader.  It provides parameters used by the ID
        # column builder. Next, create the ID column builder, which provides a
        # possibly revised list of column names for the KgtkWriter.  Create
        # the KgtkWriter.  Last, process the data stream.

        # Open the input file.
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        try:
            # Create the ID builder.
            idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

            # Open the output file.
            ew: KgtkWriter = KgtkWriter.open(
                idb.column_names,
                output_kgtk_file,
                mode=KgtkWriter.Mode[kr.mode.name],
                require_all_columns=True,
                prohibit_extra_columns=True,
                fill_missing_columns=False,
                gzip_in_parallel=False,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            try:
                # Process the input file, building IDs.
                idb.process(kr, ew)
            finally:
                # Close the writer even if processing failed.
                ew.close()
        finally:
            # Close the reader even if the builder or the writer failed;
            # the writer is still closed before the reader.
            kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise KGTKException(str(e)) from e
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        column_name: str,
        prefix: str,
        type_names: typing.List[str],
        without_fields: typing.Optional[typing.List[str]],
        overwrite_column: bool,
        validate: bool,
        escape_pipes: bool,
        quantities_include_numbers: bool,
        general_strings: bool,
        remove_prefixed_columns: bool,
        ignore_unselected_types: bool,
        retain_unselected_types: bool,
        build_id: bool,
        show_data_types: bool,
        quiet: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Drive a KgtkImplode pass, or just list the known data types.

    When ``show_data_types`` is true, each value of
    ``KgtkFormat.DataType.choices()`` is printed to the error stream and
    the function returns without touching the input. Otherwise the implode
    parameters are forwarded to KgtkImplode unchanged, with a None
    ``without_fields`` replaced by an empty list.

    Args:
        input_file: Required input file argument.
        output_file: Required output file argument.
        reject_file: Optional output file argument.
        show_data_types: Print the data type names and return.
        errors_to_stdout: When true, diagnostics go to stdout, not stderr.
        errors_to_stderr: Accepted but not consulted by this function.
        show_options: When true, echo the effective options first.
        **kwargs: Source for the ID builder, reader and value options.

    Returns:
        0 after listing the data types or after processing has completed.

    Raises:
        KGTKException: If processing raises any Exception or SystemExit.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.reshape.kgtkimplode import KgtkImplode
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    in_path: Path = KGTKArgumentParser.get_input_file(input_file)
    out_path: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reject_file, who="KGTK reject file")

    # Diagnostics default to stderr; stdout is used only on request.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # All three option structures are populated from the same keyword dict.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective options for debugging and documentation.
        def emit(text: str, flush: bool = True) -> None:
            print(text, file=error_file, flush=flush)

        emit("--input-file=%s" % str(in_path), flush=False)
        emit("--output-file=%s" % str(out_path))
        if reject_path is not None:
            emit("--reject-file=%s" % str(reject_path))
        emit("--column %s" % column_name)
        emit("--prefix %s" % prefix)

        for template, flag in (
                ("--overwrite %s", overwrite_column),
                ("--validate %s", validate),
                ("--escape-pipes %s", escape_pipes),
                ("--quantities-include-numbers %s", quantities_include_numbers),
                ("--general-strings %s", general_strings),
                ("--remove-prefixed-columns %s", remove_prefixed_columns),
                ("--ignore-unselected-types %s", ignore_unselected_types),
                ("--retain-unselected-types %s", retain_unselected_types),
        ):
            emit(template % str(flag))

        if type_names is not None:
            emit("--types %s" % " ".join(type_names))
        if without_fields is not None:
            emit("--without %s" % " ".join(without_fields))
        emit("--show-data-types %s" % str(show_data_types))
        emit("--quiet %s" % str(quiet))
        emit("--build-id=%s" % str(build_id))
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        emit("=======")

    if show_data_types:
        # Listing mode: report the data type names and stop.
        for type_name in KgtkFormat.DataType.choices():
            print("%s" % type_name, file=error_file, flush=True)
        return 0

    # KgtkImplode is always handed a list, never None.
    excluded_fields: typing.List[str]
    if without_fields is not None:
        excluded_fields = without_fields
    else:
        excluded_fields = list()

    try:
        KgtkImplode(
            input_file_path=in_path,
            output_file_path=out_path,
            reject_file_path=reject_path,
            column_name=column_name,
            prefix=prefix,
            type_names=type_names,
            without_fields=excluded_fields,
            overwrite_column=overwrite_column,
            validate=validate,
            escape_pipes=escape_pipes,
            quantities_include_numbers=quantities_include_numbers,
            general_strings=general_strings,
            remove_prefixed_columns=remove_prefixed_columns,
            ignore_unselected_types=ignore_unselected_types,
            retain_unselected_types=retain_unselected_types,
            quiet=quiet,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
def run(input_file: KGTKFiles,
        entity_label_files: KGTKFiles,
        output_file: KGTKFiles,

        label_properties: typing.Optional[typing.List[str]],
        description_properties: typing.Optional[typing.List[str]],
        isa_properties: typing.Optional[typing.List[str]],
        has_properties: typing.Optional[typing.List[str]],
        property_values: typing.Optional[typing.List[str]],
        sentence_label: str,
        explain: bool,
        presorted: bool,
        add_entity_labels_from_input: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
        ) -> int:
    """Run the Lexicalize worker over a KGTK input file.

    Property lists passed as None are replaced by the module-level
    DEFAULT_* lists.  Entity label files, when given, are loaded into the
    Lexicalize object before the input file is processed.  The input must
    have node1, label, and node2 columns (or aliases).  When `explain` is
    set, an extra "explaination" column is added to the output columns.

    Returns 0 on success.

    Raises KGTKException when a required column is missing, and also
    wraps any other Exception raised while loading the entity labels or
    processing the input.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.gt.lexicalize_utils import Lexicalize
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    entity_label_kgtk_files: typing.List[Path] = KGTKArgumentParser.get_input_file_list(entity_label_files,
                                                                                        who="The entity label file(s)",
                                                                                        default_stdin=False)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Fall back to the module defaults for any property list that was not supplied.
    if label_properties is None:
        label_properties = DEFAULT_LABEL_PROPERTIES
    if description_properties is None:
        description_properties = DEFAULT_DESCRIPTION_PROPERTIES
    if isa_properties is None:
        isa_properties = DEFAULT_ISA_PROPERTIES
    if has_properties is None:
        has_properties = DEFAULT_HAS_PROPERTIES
    if property_values is None:
        property_values = DEFAULT_PROPERTY_VALUES

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        if len(entity_label_kgtk_files) > 0:
            print("--entity-label-files %s" % " ".join([str(f) for f in entity_label_kgtk_files]),
                  file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if len(label_properties) > 0:
            print("--label-properties %s" % " ".join(label_properties), file=error_file, flush=True)
        if len(description_properties) > 0:
            print("--description-properties %s" % " ".join(description_properties), file=error_file, flush=True)
        if len(isa_properties) > 0:
            print("--isa-properties %s" % " ".join(isa_properties), file=error_file, flush=True)
        if len(has_properties) > 0:
            print("--has-properties %s" % " ".join(has_properties), file=error_file, flush=True)
        if len(property_values) > 0:
            print("--property-values %s" % " ".join(property_values), file=error_file, flush=True)
        print("--sentence-label=%s" % str(sentence_label), file=error_file, flush=True)
        print("--explain=%s" % str(explain), file=error_file, flush=True)
        print("--presorted=%s" % str(presorted), file=error_file, flush=True)
        # NOTE(review): flag spelling inferred from the parameter name -- confirm against the argument parser.
        print("--add-entity-labels-from-input=%s" % str(add_entity_labels_from_input), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    kr: typing.Optional[KgtkReader] = None
    kw: typing.Optional[KgtkWriter] = None

    try:
        # Building the lexer and loading the label files happens inside the
        # try block so that failures are reported as KGTKException, like
        # every other failure of this command.
        lexer: Lexicalize = Lexicalize(label_properties,
                                       description_properties,
                                       isa_properties,
                                       has_properties,
                                       property_values,
                                       sentence_label,
                                       explain=explain,
                                       error_file=error_file,
                                       verbose=verbose,
                                       very_verbose=very_verbose)
        if len(entity_label_kgtk_files) > 0:
            lexer.load_entity_label_files(entity_label_kgtk_files,
                                          error_file,
                                          reader_options,
                                          value_options,
                                          label_properties=label_properties,
                                          verbose=verbose)

        if verbose:
            print("Opening the input file %s" % str(input_kgtk_file), file=error_file, flush=True)
        kr = KgtkReader.open(input_kgtk_file,
                             options=reader_options,
                             value_options=value_options,
                             error_file=error_file,
                             verbose=verbose,
                             very_verbose=very_verbose,
                             )
        if kr.node1_column_idx < 0:
            raise KGTKException("Missing column: node1 or alias")
        if kr.label_column_idx < 0:
            raise KGTKException("Missing column: label or alias")
        if kr.node2_column_idx < 0:
            raise KGTKException("Missing column: node2 or alias")
        if verbose:
            print("node1 column index = {}".format(kr.node1_column_idx), file=error_file, flush=True)
            print("label column index = {}".format(kr.label_column_idx), file=error_file, flush=True)
            print("node2 column index = {}".format(kr.node2_column_idx), file=error_file, flush=True)

        # Work on a copy so the module-level column list is never mutated.
        output_columns: typing.List[str] = OUTPUT_COLUMNS.copy()
        if explain:
            output_columns.append("explaination")
            if verbose:
                print("Including an explaination column in the output.", file=error_file, flush=True)

        if verbose:
            print("Opening the output file %s" % str(output_kgtk_file), file=error_file, flush=True)
        kw = KgtkWriter.open(output_columns,
                             output_kgtk_file,
                             require_all_columns=True,
                             prohibit_extra_columns=True,
                             fill_missing_columns=False,
                             gzip_in_parallel=False,
                             verbose=verbose,
                             very_verbose=very_verbose,
                             )

        if presorted:
            lexer.process_presorted_input(kr, kw)
        else:
            lexer.process_unsorted_input(kr, kw, add_entity_labels=add_entity_labels_from_input)

        return 0

    except Exception as e:
        raise KGTKException(str(e))

    finally:
        # Release the files whether or not processing succeeded.
        if kw is not None:
            kw.close()
        if kr is not None:
            kr.close()
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,

        pattern: str,
        subj_col: typing.Optional[str],
        pred_col: typing.Optional[str],
        obj_col: typing.Optional[str],

        or_pattern: bool,
        invert: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Filter the rows of a KGTK file by subject, predicate, and object values.

    `pattern` has three sections separated by semicolons (subject;
    predicate; object).  Each section is a comma-separated list of
    accepted values; an empty section disables that filter.  By default a
    row is written to the output when no applied filter fails; with
    `or_pattern` it is written when at least one applied filter matches.
    `invert` flips the decision.  Rows that are not written to the output
    are written to the reject file when one was requested.

    Returns 0 on success.  Failures are passed to
    kgtk_exception_auto_handler(); 1 is returned if that handler returns.
    """
    # import modules locally
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    # The reject file must be resolved from `reject_file`, not `output_file`.
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(reject_file,
                                                                                          who="KGTK reject file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--pattern=%s" % str(pattern), file=error_file)
        if subj_col is not None:
            print("--subj=%s" % str(subj_col), file=error_file)
        if pred_col is not None:
            print("--pred=%s" % str(pred_col), file=error_file)
        if obj_col is not None:
            print("--obj=%s" % str(obj_col), file=error_file)
        print("--or=%s" % str(or_pattern), file=error_file)
        print("--invert=%s" % str(invert), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    def prepare_filter(pattern: str) -> typing.Set[str]:
        """Split one pattern section on commas into a set of non-empty, stripped values."""
        return {target.strip() for target in pattern.split(",") if target.strip()}

    kr: typing.Optional[KgtkReader] = None
    kw: typing.Optional[KgtkWriter] = None
    rw: typing.Optional[KgtkWriter] = None

    try:
        patterns: typing.List[str] = pattern.split(";")
        if len(patterns) != 3:
            print("Error: The pattern must have three sections separated by semicolons (two semicolons total).",
                  file=error_file, flush=True)
            raise KGTKException("Bad pattern")

        subj_filter: typing.Set[str] = prepare_filter(patterns[0])
        pred_filter: typing.Set[str] = prepare_filter(patterns[1])
        obj_filter: typing.Set[str] = prepare_filter(patterns[2])
        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0

        if verbose and not (apply_subj_filter or apply_pred_filter or apply_obj_filter):
            print("Warning: the filter is empty.", file=error_file, flush=True)

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file), file=error_file, flush=True)
        kr = KgtkReader.open(input_kgtk_file,
                             options=reader_options,
                             value_options=value_options,
                             error_file=error_file,
                             verbose=verbose,
                             very_verbose=very_verbose,
                             )

        subj_idx: int = kr.get_node1_column_index(subj_col)
        pred_idx: int = kr.get_label_column_index(pred_col)
        obj_idx: int = kr.get_node2_column_index(obj_col)

        # Complain about a missing column only when it is needed by the pattern.
        trouble: bool = False
        if subj_idx < 0 and apply_subj_filter:
            trouble = True
            print("Error: Cannot find the subject column '%s'." % kr.get_node1_canonical_name(subj_col),
                  file=error_file, flush=True)
        if pred_idx < 0 and apply_pred_filter:
            trouble = True
            print("Error: Cannot find the predicate column '%s'." % kr.get_label_canonical_name(pred_col),
                  file=error_file, flush=True)
        if obj_idx < 0 and apply_obj_filter:
            trouble = True
            print("Error: Cannot find the object column '%s'." % kr.get_node2_canonical_name(obj_col),
                  file=error_file, flush=True)
        if trouble:
            raise KGTKException("Missing columns.")

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file), file=error_file, flush=True)
        kw = KgtkWriter.open(kr.column_names,
                             output_kgtk_file,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             verbose=verbose,
                             very_verbose=very_verbose)

        if reject_kgtk_file is not None:
            if verbose:
                print("Opening the reject file: %s" % str(reject_kgtk_file), file=error_file, flush=True)
            rw = KgtkWriter.open(kr.column_names,
                                 reject_kgtk_file,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            # `keep` records that some applied filter matched, `reject` that
            # some applied filter failed to match.
            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # OR mode: the row passes when any filter matched.
            # AND mode: the row passes when no filter failed.
            # `invert` flips the outcome in both modes.
            rejected: bool
            if or_pattern:
                rejected = not (keep ^ invert)
            else:
                rejected = reject ^ invert

            if rejected:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        if verbose:
            # Send the summary to the error file so that it cannot be mixed
            # into data written to standard output.
            print("Read %d rows, rejected %d rows, wrote %d rows." % (input_line_count,
                                                                       reject_line_count,
                                                                       output_line_count),
                  file=error_file, flush=True)
            print("Keep counts: subject=%d, predicate=%d, object=%d." % (subj_filter_keep_count,
                                                                          pred_filter_keep_count,
                                                                          obj_filter_keep_count),
                  file=error_file, flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d." % (subj_filter_reject_count,
                                                                            pred_filter_reject_count,
                                                                            obj_filter_reject_count),
                  file=error_file, flush=True)

        # Close the writers here so that a failure while closing is still
        # routed through the exception handler below.  Each name is cleared
        # only after its close() succeeds, so the finally clause never closes
        # a writer twice.
        kw.close()
        kw = None
        if rw is not None:
            rw.close()
            rw = None

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1

    finally:
        # Release whatever is still open (the reader always, the writers
        # only when processing failed part way through).
        if kw is not None:
            kw.close()
        if rw is not None:
            rw.close()
        if kr is not None:
            kr.close()
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        label_file: KGTKFiles,

        input_select_column_name: typing.Optional[str],
        input_select_column_value: typing.Optional[str],
        input_lifting_column_names: typing.List[str],

        output_lifted_column_names: typing.List[str],
        output_lifted_column_suffix: str,
        output_select_column_value: str,

        label_select_column_name: typing.Optional[str],
        label_select_column_value: str,
        label_match_column_name: typing.Optional[str],
        label_value_column_name: typing.Optional[str],

        remove_label_records: bool = False,
        sort_lifted_labels: bool = True,
        suppress_duplicate_labels: bool = True,
        suppress_empty_columns: bool = False,
        ok_if_no_labels: bool = False,
        prefilter_labels: bool = False,
        input_is_presorted: bool = False,
        labels_are_presorted: bool = False,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkLift object from the command line options and run it.

    The input and output files are required; the label file is optional.
    All remaining options are passed through to KgtkLift unchanged.

    Returns 0 on success.

    Raises KGTKException when KgtkLift exits via SystemExit or raises any
    other Exception.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.lift.kgtklift import KgtkLift
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    label_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_input_file(label_file,
                                                                                        who="KGTK label file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    # Every line goes to the error file so that the option dump cannot be
    # mixed into data written to standard output.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if label_kgtk_file is not None:
            print("--label-file=%s" % label_kgtk_file, file=error_file, flush=True)

        if input_select_column_name is not None:
            print("--input-select-column=%s" % input_select_column_name, file=error_file, flush=True)
        if input_select_column_value is not None:
            print("--input-select-value=%s" % input_select_column_value, file=error_file, flush=True)
        if input_lifting_column_names is not None and len(input_lifting_column_names) > 0:
            print("--columns-to-lift %s" % " ".join(input_lifting_column_names), file=error_file, flush=True)

        if output_lifted_column_names is not None and len(output_lifted_column_names) > 0:
            print("--columns-to-write %s" % " ".join(output_lifted_column_names), file=error_file, flush=True)
        print("--lift-suffix=%s" % output_lifted_column_suffix, file=error_file, flush=True)
        if output_select_column_value is not None:
            print("--update-select-value=%s" % output_select_column_value, file=error_file, flush=True)

        if label_select_column_name is not None:
            print("--label-select-column=%s" % label_select_column_name, file=error_file, flush=True)
        print("--label-select-value=%s" % label_select_column_value, file=error_file, flush=True)
        if label_match_column_name is not None:
            print("--label-match-column=%s" % label_match_column_name, file=error_file, flush=True)
        if label_value_column_name is not None:
            print("--label-value-column=%s" % label_value_column_name, file=error_file, flush=True)

        print("--remove-label-records=%s" % str(remove_label_records), file=error_file, flush=True)
        print("--sort-lifted-labels=%s" % str(sort_lifted_labels), file=error_file, flush=True)
        print("--suppress-duplicate-labels=%s" % str(suppress_duplicate_labels), file=error_file, flush=True)
        print("--suppress-empty-columns=%s" % str(suppress_empty_columns), file=error_file, flush=True)
        print("--ok-if-no-labels=%s" % str(ok_if_no_labels), file=error_file, flush=True)
        print("--prefilter-labels=%s" % str(prefilter_labels), file=error_file, flush=True)
        print("--input-file-is-presorted=%s" % str(input_is_presorted), file=error_file, flush=True)
        print("--label-file-is-presorted=%s" % str(labels_are_presorted), file=error_file, flush=True)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kl: KgtkLift = KgtkLift(
            input_file_path=input_kgtk_file,
            label_file_path=label_kgtk_file,
            output_file_path=output_kgtk_file,

            input_select_column_name=input_select_column_name,
            input_select_column_value=input_select_column_value,
            input_lifting_column_names=input_lifting_column_names,

            output_lifted_column_suffix=output_lifted_column_suffix,
            output_select_column_value=output_select_column_value,
            output_lifted_column_names=output_lifted_column_names,

            label_select_column_name=label_select_column_name,
            label_select_column_value=label_select_column_value,
            label_match_column_name=label_match_column_name,
            label_value_column_name=label_value_column_name,

            remove_label_records=remove_label_records,
            sort_lifted_labels=sort_lifted_labels,
            suppress_duplicate_labels=suppress_duplicate_labels,
            suppress_empty_columns=suppress_empty_columns,
            ok_if_no_labels=ok_if_no_labels,
            prefilter_labels=prefilter_labels,
            input_is_presorted=input_is_presorted,
            labels_are_presorted=labels_are_presorted,

            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        kl.process()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,

        column_name: str,
        type_names: typing.List[str],
        field_names: typing.List[str],
        prefix: str,
        overwrite_columns: bool,
        expand_list: bool,
        show_data_types: bool,
        output_format: typing.Optional[str],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a KgtkExplode object from the command line options and run it.

    When `show_data_types` is set, the choices reported by
    KgtkFormat.DataType are printed to the error file and the command
    returns without reading any input.

    Returns 0 on success.

    Raises KGTKException when KgtkExplode exits via SystemExit or raises
    any other Exception.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkexplode import KgtkExplode
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Error messages go to stderr unless stdout was requested.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    def report(text: str, flush: bool = True) -> None:
        """Write one line to the selected error file."""
        print(text, file=error_file, flush=flush)

    # Show the final option structures for debugging and documentation.
    if show_options:
        report("--input-file=%s" % str(input_kgtk_file), flush=False)
        report("--output-file=%s" % str(output_kgtk_file), flush=False)
        report("--column %s" % column_name)
        report("--prefix %s" % prefix)
        report("--overwrite %s" % str(overwrite_columns))
        report("--expand %s" % str(expand_list))
        if type_names is not None:
            report("--types %s" % " ".join(type_names))
        if field_names is not None:
            report("--fields %s" % " ".join(field_names))
        report("--show-data-types %s" % str(show_data_types))
        if output_format is not None:
            report("--output-format=%s" % output_format)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        report("=======")

    # Listing the data types is a stand-alone request: answer it and stop.
    if show_data_types:
        type_choice: str
        for type_choice in KgtkFormat.DataType.choices():
            report("%s" % type_choice)
        return 0

    try:
        exploder: KgtkExplode = KgtkExplode(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            output_format=output_format,
            column_name=column_name,
            type_names=type_names,
            field_names=field_names,
            prefix=prefix,
            overwrite_columns=overwrite_columns,
            expand_list=expand_list,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        exploder.process()
    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
    else:
        return 0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,

        column_names: typing.Optional[typing.List[str]] = None,
        empty_value: str = "",
        label_value: str = "count",

        output_format: str = "edge",
        prefix: str = "",
        where_column_name: typing.Optional[str] = None,
        where_values: typing.Optional[typing.List[str]] = None,

        value_filter: str = "",
        value_match_type: str = "match",

        presorted: bool = False,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Configure a Unique object from the command line options and run it.

    All options are passed through to Unique unchanged.

    Returns 0 on success.

    Raises KGTKException when Unique exits via SystemExit or raises any
    other Exception.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.join.unique import Unique
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if column_names is not None:
            print("--columns %s" % " ".join(column_names), file=error_file, flush=True)
        print("--empty=%s" % str(empty_value), file=error_file)
        print("--label=%s" % str(label_value), file=error_file)
        print("--format=%s" % output_format, file=error_file)
        print("--prefix=%s" % prefix, file=error_file)
        if where_column_name is not None:
            print("--where=%s" % where_column_name, file=error_file)
        if where_values is not None and len(where_values) > 0:
            print("--in=%s" % " ".join(where_values), file=error_file)
        print("--value-filter=%s" % repr(value_filter), file=error_file)
        print("--value-match-type=%s" % repr(value_match_type), file=error_file)
        # The presorted flag is reported under its own name; it used to be
        # printed as a second, misleading "--prefix" line.
        print("--presorted=%s" % repr(presorted), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        uniq: Unique = Unique(
            input_file_path=input_kgtk_file,
            output_file_path=output_kgtk_file,
            column_names=column_names,
            label_value=label_value,
            empty_value=empty_value,
            output_format=output_format,
            prefix=prefix,
            where_column_name=where_column_name,
            where_values=where_values,
            value_filter=value_filter,
            value_match_type=value_match_type,
            presorted=presorted,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        uniq.process()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        output_format: typing.Optional[str],

        column_names: typing.List[str],
        omit_remaining_columns: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file with its columns reordered.

    `column_names` lists the columns in their new order.  Two operators
    may appear in the list:

    "..."  stands for every column not otherwise mentioned; it may appear
           only once.
    ".."   between two column names stands for every input column lying
           between them, in ascending or descending input order.

    Without "...", unmentioned columns are an error unless
    `omit_remaining_columns` is set, in which case they are dropped.

    Returns 0 on success.

    Raises KGTKException for an invalid column list, on SystemExit, or
    wrapping any other Exception raised while copying.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if output_format is not None:
            print("--output-format=%s" % output_format, file=error_file, flush=True)
        print("--columns %s" % " ".join(column_names), file=error_file, flush=True)
        print("--trim=%s" % str(omit_remaining_columns), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    kr: typing.Optional[KgtkReader] = None
    kw: typing.Optional[KgtkWriter] = None

    try:
        if verbose:
            print("Opening the input file %s" % str(input_kgtk_file), file=error_file, flush=True)
        kr = KgtkReader.open(input_kgtk_file,
                             options=reader_options,
                             value_options=value_options,
                             error_file=error_file,
                             verbose=verbose,
                             very_verbose=very_verbose,
                             )

        # Input columns that have not been placed yet.
        remaining_names: typing.List[str] = kr.column_names.copy()
        # Columns placed so far (after the ellipses, once it has been seen).
        reordered_names: typing.List[str] = []
        # Columns placed before the ellipses; None until the ellipses is seen.
        save_reordered_names: typing.Optional[typing.List[str]] = None

        ellipses: str = "..."  # All unmentioned columns
        ranger: str = ".."  # All columns between two columns.

        saw_ranger: bool = False

        column_name: str
        for column_name in column_names:
            if column_name == ellipses:
                if save_reordered_names is not None:
                    raise KGTKException("Elipses may appear only once")

                if saw_ranger:
                    raise KGTKException("ELipses may not appear directly after a range operator ('..').")

                save_reordered_names = reordered_names
                reordered_names = []
                continue

            if column_name == ranger:
                if len(reordered_names) == 0:
                    raise KGTKException(
                        "The column range operator ('..') may not appear without a preceeding column name.")
                saw_ranger = True
                continue

            if column_name not in kr.column_names:
                raise KGTKException("Unknown column name '%s'." % column_name)
            if column_name not in remaining_names:
                raise KGTKException("Column name '%s' was duplicated in the list." % column_name)

            if saw_ranger:
                saw_ranger = False
                prior_column_name: str = reordered_names[-1]
                prior_column_idx: int = kr.column_name_map[prior_column_name]
                column_name_idx: int = kr.column_name_map[column_name]

                # Walk the columns strictly between the two endpoints, in
                # whichever direction leads from the prior column to this
                # one.  range() excludes its stop value, so the closing
                # endpoint is left for the append below.
                idx_inc: int = 1 if column_name_idx > prior_column_idx else -1
                idx: int
                for idx in range(prior_column_idx + idx_inc, column_name_idx, idx_inc):
                    idx_column_name: str = kr.column_names[idx]
                    if idx_column_name not in remaining_names:
                        raise KGTKException("Column name '%s' (%s .. %s) was duplicated in the list." %
                                            (idx_column_name, prior_column_name, column_name))
                    reordered_names.append(idx_column_name)
                    remaining_names.remove(idx_column_name)

            reordered_names.append(column_name)
            remaining_names.remove(column_name)

        if saw_ranger:
            raise KGTKException("The column ranger operator ('..') may not end the list of column names.")

        if len(remaining_names) > 0 and save_reordered_names is None:
            # There are remaining column names and the ellipses was not seen.
            if not omit_remaining_columns:
                raise KGTKException(
                    "No ellipses, and the following columns not accounted for: %s" % " ".join(remaining_names))
            else:
                if verbose:
                    print("Omitting the following columns: %s" % " ".join(remaining_names),
                          file=error_file, flush=True)

        if save_reordered_names is not None:
            # Final order: columns before the ellipses, then the unmentioned
            # columns, then the columns listed after the ellipses.
            if len(remaining_names) > 0:
                save_reordered_names.extend(remaining_names)
            if len(reordered_names) > 0:
                save_reordered_names.extend(reordered_names)
            reordered_names = save_reordered_names

        if verbose:
            print("Opening the output file %s" % str(output_kgtk_file), file=error_file, flush=True)
        kw = KgtkWriter.open(reordered_names,
                             output_kgtk_file,
                             require_all_columns=True,
                             prohibit_extra_columns=True,
                             fill_missing_columns=False,
                             gzip_in_parallel=False,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             output_format=output_format,
                             verbose=verbose,
                             very_verbose=very_verbose,
                             )

        shuffle_list: typing.List = kw.build_shuffle_list(kr.column_names)

        input_data_lines: int = 0
        row: typing.List[str]
        for row in kr:
            input_data_lines += 1
            kw.write(row, shuffle_list=shuffle_list)

        # Flush the output file so far:
        kw.flush()

        if verbose:
            print("Read %d data lines from file %s" % (input_data_lines, input_kgtk_file),
                  file=error_file, flush=True)

        # Close the writer here so that a failure while closing is still
        # reported as a KGTKException.  The name is cleared only after
        # close() succeeds, so the finally clause never closes it twice.
        kw.close()
        kw = None

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
    finally:
        # Release whatever is still open (the reader always, the writer
        # only when processing failed part way through).
        if kw is not None:
            kw.close()
        if kr is not None:
            kr.close()
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    columns: typing.Optional[typing.List[str]] = None,
    locale: str = "C",
    reverse_sort: bool = False,
    reverse_columns: typing.Optional[typing.List[str]] = None,
    numeric_sort: bool = False,
    numeric_columns: typing.Optional[typing.List[str]] = None,
    pure_python: bool = False,
    extra: typing.Optional[str] = None,
    bash_command: str = "bash",
    bzip2_command: str = "bzip2",
    gzip_command: str = "gzip",
    pgrep_command: str = "pgrep",
    sort_command: str = "sort",
    xz_command: str = "xz",
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Sort a KGTK file with an external sort command or a pure Python sorter.

    In the default mode a shell pipeline is started with `bash_command`.
    The pipeline reads the header line, copies it to Python through a pipe,
    writes it to the output, reads the sort options from a second pipe, and
    then runs `sort_command` on the remaining lines with a tab separator.
    Python reads the header with KgtkReader only to translate the requested
    columns into `-k` options.

    Args:
        input_file: The KGTK input file ("-" means standard input).
        output_file: The KGTK output file ("-" means standard output).
        columns: Sort keys as column names or 1-based column numbers. Each
            entry may itself be a comma-separated list. When empty, the id
            column (node files) or id/node1/label/node2 (edge files) is used.
        locale: Value for LC_ALL when running the sort command; an empty
            string omits the setting.
        reverse_sort: Sort in reverse order.
        reverse_columns: Columns whose sort key gets the "r" modifier.
        numeric_sort: Sort numerically.
        numeric_columns: Columns whose sort key gets the "n" modifier.
        pure_python: Sort in memory in Python instead of using the pipeline.
        extra: Extra text appended to the sort command options.
        bash_command, bzip2_command, gzip_command, pgrep_command,
        sort_command, xz_command: Names of the external programs to run.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not used by this function.
        show_options: Not used by this function.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Options for KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success.

    Raises:
        KGTKException: On invalid sort columns, unsupported pure Python
            options, or any failure inside the sort pipeline.
    """
    from io import StringIO
    import os
    from pathlib import Path
    import sh  # type: ignore
    import sys
    import typing

    from kgtk.cli_entry import progress_startup
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_path: Path = KGTKArgumentParser.get_input_file(input_file)
    output_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    def python_sort():
        """Sort the input in memory and write the rows with KgtkWriter."""
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException('Error: the pure Python sorter does not currently support numeric column sorts.')

        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException('Error: the pure Python sorter does not currently support reverse column sorts.')

        if verbose:
            print("Opening the input file: %s" % str(input_path), file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_path,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sort_idx: int
        key_idxs: typing.List[int] = []
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            column_name: str
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        sort_idx = int(column_name_2)
                        # Column numbers are 1-based: 0 would silently
                        # select the last column (index -1).
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException("Invalid column number %d (max %d)." % (sort_idx, len(kr.column_names)))
                        key_idxs.append(sort_idx - 1)
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" % column_name_2)
                        key_idxs.append(kr.column_name_map[column_name_2])
        else:
            if kr.is_node_file:
                key_idxs.append(kr.id_column_idx)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    key_idxs.append(kr.id_column_idx)

                key_idxs.append(kr.node1_column_idx)
                key_idxs.append(kr.label_column_idx)
                key_idxs.append(kr.node2_column_idx)
            else:
                kr.close()
                cleanup()
                raise KGTKException("Unknown KGTK file mode, please specify the sorting columns.")

        if verbose:
            print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]), file=error_file, flush=True)

        if numeric_sort and len(key_idxs) > 1:
            kr.close()
            raise KGTKException('Error: the pure Python sorter does not currently support numeric sorts on multiple columns.')

        lines: typing.MutableMapping[typing.Union[str, float], typing.List[typing.List[str]]] = dict()

        progress_startup()
        input_line_count: int = 0
        key: typing.Union[str, float]
        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx] for idx in key_idxs)
            if numeric_sort:
                key = float(key)
            # Rows that share a key are kept in input order, which makes
            # this a stable sort.
            lines.setdefault(key, []).append(row)
        if verbose:
            # Report rows read, not the number of distinct keys.
            print("\nRead %d data lines." % input_line_count, file=error_file, flush=True)

        kw = KgtkWriter.open(kr.column_names,
                             output_path,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             verbose=verbose,
                             very_verbose=very_verbose)

        for key in sorted(lines.keys(), reverse=reverse_sort):
            for row in lines[key]:
                kw.write(row)

        kw.close()
        kr.close()

    if pure_python:
        python_sort()
        return 0  # Honor the declared int return value.

    try:
        global header_read_fd
        global header_write_fd
        header_read_fd, header_write_fd = os.pipe()
        # The shell script writes the header to this pipe, so the child
        # process must inherit the write end.
        os.set_inheritable(header_write_fd, True)
        if verbose:
            print("header pipe: read_fd=%d write_fd=%d" % (header_read_fd, header_write_fd), file=error_file, flush=True)

        global sortopt_read_fd
        global sortopt_write_fd
        sortopt_read_fd, sortopt_write_fd = os.pipe()
        # The shell script reads the sort options from this pipe, so the
        # child process must inherit the read end.
        os.set_inheritable(sortopt_read_fd, True)
        if verbose:
            print("sort options pipe: read_fd=%d write_fd=%d" % (sortopt_read_fd, sortopt_write_fd), file=error_file, flush=True)

        locale_envar: str = "LC_ALL=%s" % locale if len(locale) > 0 else ""

        # Note: "read -u n", used below, is not supported by some shells.
        # bash and zsh support it.
        # ash, csh, dash, and tcsh do not.
        # The original standard Bourne shell, sh, does not.
        # ksh might do it, if the FD number is a single digit.
        cmd: str = "".join((
            "{ IFS= read -r header ; ",  # Read the header line
            " { printf \"%s\\n\" \"$header\" >&" + str(header_write_fd) + " ; } ; ",  # Send the header to Python
            " printf \"%s\\n\" \"$header\" ; ",  # Send the header to standard output (which may be redirected to a file, below).
            " IFS= read -u " + str(sortopt_read_fd) + " -r options ; ",  # Read the sort command options from Python.
            " %s %s -t '\t' $options ; } " % (locale_envar, sort_command),  # Sort the remaining input lines using the options read from Python.
        ))

        if str(output_path) != "-":
            # Do we want to compress the output?
            output_suffix: str = output_path.suffix.lower()
            if output_suffix in [".gz", ".z"]:
                if verbose:
                    print("gzip output file: %s" % repr(str(output_path)), file=error_file, flush=True)
                cmd += " | " + gzip_command + " -"
            elif output_suffix in [".bz2", ".bz"]:
                if verbose:
                    print("bzip2 output file: %s" % repr(str(output_path)), file=error_file, flush=True)
                cmd += " | " + bzip2_command + " -z"
            elif output_suffix in [".xz", ".lzma"]:
                if verbose:
                    print("xz output file: %s" % repr(str(output_path)), file=error_file, flush=True)
                cmd += " | " + xz_command + " -z -"

            # Feed the sorted output to the named file.  Otherwise, the sorted
            # output goes to standard output without passing through Python.
            # NOTE(review): repr() is used here as shell quoting; it is not a
            # general shell-quoting function -- confirm for unusual paths.
            cmd += " > " + repr(str(output_path))

        if verbose:
            print("sort command: %s" % cmd, file=error_file, flush=True)

        global cat_proc
        cat_proc = None
        global cmd_proc
        cmd_proc = None

        def cat_done(cmd, success, exit_code):
            # When the cat command finishes, monitor the progress of the sort command.
            if verbose:
                print("\nDone reading the input file", file=error_file, flush=True)
            if cmd_proc is None:
                return

            # Locate the sort command using pgrep
            buf = StringIO()
            try:
                sh_pgrep = sh.Command(pgrep_command)
                sh_pgrep("-g", cmd_proc.pgid, "--newest", sort_command, _out=buf)
                pgrep_output = buf.getvalue()
                if len(pgrep_output) == 0:
                    if verbose:
                        print("Unable to locate the sort command.", file=error_file, flush=True)
                    return
                sort_pid = int(pgrep_output)
            except Exception as e:
                if verbose:
                    print("Exception looking for sort command: %s" % str(e), file=error_file, flush=True)
                return
            finally:
                buf.close()

            if verbose:
                print("Monitoring the sort command (pid=%d)" % sort_pid, file=error_file, flush=True)
            progress_startup(pid=sort_pid)

        if str(input_path) == "-":
            # Read from standard input.
            #
            # Sh version 1.13 or greater is required for _pass_fds.
            sh_bash = sh.Command(bash_command)
            cmd_proc = sh_bash("-c", cmd, _in=sys.stdin, _out=sys.stdout, _err=sys.stderr,
                               _bg=True, _bg_exc=False, _internal_bufsize=1,
                               _pass_fds={header_write_fd, sortopt_read_fd})

            # It would be nice to monitor the sort command here.  Unfortunately, there
            # is a race condition that makes this difficult.  We could loop until the
            # sort command is created, then monitor it.

        else:
            # Feed the named file into the data processing pipeline,
            # decompressing it first when the suffix calls for it.
            input_suffix: str = input_path.suffix.lower()
            cat_description: str
            cat_command: typing.Optional[str]
            if input_suffix in [".gz", ".z"]:
                cat_description, cat_command = "gunzip input file", gzip_command
            elif input_suffix in [".bz2", ".bz"]:
                cat_description, cat_command = "bunzip2 input file", bzip2_command
            elif input_suffix in [".xz", ".lzma"]:
                cat_description, cat_command = "unxz input file", xz_command
            else:
                cat_description, cat_command = "input file", None

            if verbose:
                print("%s: %s" % (cat_description, repr(str(input_path))), file=error_file, flush=True)

            if cat_command is None:
                cat_proc = sh.cat(input_path, _in=sys.stdin, _piped=True, _err=sys.stderr,
                                  _bg=True, _bg_exc=False, _internal_bufsize=1, _done=cat_done)
                if verbose:
                    print("full command: cat %s | %s" % (repr(str(input_path)), cmd), file=error_file, flush=True)
            else:
                sh_decompress = sh.Command(cat_command)
                cat_proc = sh_decompress(input_path, "-dc", _in=sys.stdin, _piped=True, _err=sys.stderr,
                                         _bg=True, _bg_exc=False, _internal_bufsize=1, _done=cat_done)
                if verbose:
                    print("full command: %s -dc %s | %s" % (cat_command, repr(str(input_path)), cmd),
                          file=error_file, flush=True)

            # If enabled, monitor the progress of reading the input file.
            # Since we do not have access to the pid of the sort command,
            # we cannot monitor the progress of the merge phases.
            if verbose:
                print("Monitoring the cat command (pid=%d)." % cat_proc.pid, file=error_file, flush=True)
            progress_startup(pid=cat_proc.pid)

            # Sh version 1.13 or greater is required for _pass_fds.
            sh_bash = sh.Command(bash_command)
            cmd_proc = sh_bash(cat_proc, "-c", cmd, _out=sys.stdout, _err=sys.stderr,
                               _bg=True, _bg_exc=False, _internal_bufsize=1,
                               _pass_fds={header_write_fd, sortopt_read_fd})
            # Since we do not have access to the pid of the sort command,
            # we cannot monitor the progress of the merge phases.
            if verbose:
                print("Running the sort script (pid=%d)." % cmd_proc.pid, file=error_file, flush=True)

        if verbose:
            print("Reading the KGTK input file header line with KgtkReader", file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            Path("<%d" % header_read_fd),
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        if verbose:
            print("KGTK header: %s" % " ".join(kr.column_names), file=error_file, flush=True)

        sort_options: str = ""
        if reverse_sort:
            sort_options += " --reverse"
        if numeric_sort:
            sort_options += " --numeric"
        if extra is not None and len(extra) > 0:
            sort_options += " " + extra

        # We will consume entries in reverse_columns and numeric_columns,
        # then complain if any are left over.
        if reverse_columns is not None:
            reverse_columns = reverse_columns[:]  # Protect against modifying a shared list.
        if numeric_columns is not None:
            numeric_columns = numeric_columns[:]  # Protect against modifying a shared list.

        def key_option(key_idx: int, key_name: str) -> str:
            """Build the " -k N,N[r][n]" option for one 1-based sort key.

            Consumes key_name from reverse_columns / numeric_columns.
            """
            option: str = " -k %d,%d" % (key_idx, key_idx)
            if reverse_columns is not None and key_name in reverse_columns:
                option += "r"
                reverse_columns.remove(key_name)
            if numeric_columns is not None and key_name in numeric_columns:
                option += "n"
                numeric_columns.remove(key_name)
            return option

        column_name: str
        sort_idx: int
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        sort_idx = int(column_name_2)
                        # Column numbers are 1-based: "-k 0,0" is not a
                        # valid key for the sort command.
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException("Invalid column number %d (max %d)." % (sort_idx, len(kr.column_names)))
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" % repr(column_name_2))
                        sort_idx = kr.column_name_map[column_name_2] + 1
                    sort_options += key_option(sort_idx, column_name_2)
        else:
            # TODO: support the case where the column name in reverse_columns
            # or numeric_columns is an alias of the name used in the file header.
            if kr.is_node_file:
                sort_options += key_option(kr.id_column_idx + 1, kr.column_names[kr.id_column_idx])

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    sort_options += key_option(kr.id_column_idx + 1, kr.column_names[kr.id_column_idx])

                sort_options += key_option(kr.node1_column_idx + 1, kr.column_names[kr.node1_column_idx])
                sort_options += key_option(kr.label_column_idx + 1, kr.column_names[kr.label_column_idx])
                sort_options += key_option(kr.node2_column_idx + 1, kr.column_names[kr.node2_column_idx])
            else:
                kr.close()
                cleanup()
                raise KGTKException("Unknown KGTK file mode, please specify the sorting columns.")

        # Check for unconsumed entries in reverse_columns and numeric_columns:
        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException("Unknown reverse column(s) %s" % " ".join(
                [repr(column_name) for column_name in reverse_columns]))
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException("Unknown numeric column(s) %s" % " ".join(
                [repr(column_name) for column_name in numeric_columns]))

        if verbose:
            print("sort options: %s" % sort_options, file=error_file, flush=True)

        kr.close()  # We are done with the KgtkReader now.

        # Send the sort options back to the data processing pipeline.
        with open(sortopt_write_fd, "w") as options_file:
            options_file.write(sort_options + "\n")

        if verbose:
            print("\nWaiting for the sort command to complete.\n", file=error_file, flush=True)
        cmd_proc.wait()

        if verbose:
            print("Cleanup.", file=error_file, flush=True)
        cleanup()

        return 0

    except Exception as e:
        # import traceback
        # traceback.print_tb(sys.exc_info()[2], 10)
        raise KGTKException('INTERNAL ERROR: ' + type(e).__module__ + '.' + str(e) + '\n')
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs):
    """Train graph embeddings for a KGTK edge file and write them out.

    The input is first copied to a temporary TSV file, converted and trained
    with torchbiggraph, exported to TSV, and finally written in the requested
    output format.

    Args:
        input_file: The KGTK input file.
        output_file: The output file; an existing file is removed first.
        verbose: Passed through to the KGTK reader/writer helpers.
        very_verbose: Passed through to the KGTK reader/writer helpers.
        **kwargs: All remaining parameters provided by the user.  This
            function reads 'log_file_path', 'temporary_directory',
            'output_format', 'retain_temporary_data', and optionally
            'errors_to_stdout' and 'output_no_header'; the whole mapping is
            also handed to the option builders and the config helper.

    Raises:
        KGTKException: Wraps any exception raised while processing.
    """
    # import modules locally
    import sys
    import typing
    import os
    import logging
    from pathlib import Path
    import json
    import h5py
    import gzip
    import torch
    import shutil
    from torchbiggraph.config import parse_config
    from kgtk.exceptions import KGTKException
    # copy missing file under kgtk/graph_embeddings
    from kgtk.templates.kgtkcopytemplate import KgtkCopyTemplate
    from kgtk.graph_embeddings.importers import TSVEdgelistReader, convert_input_data
    from torchbiggraph.train import train
    from torchbiggraph.util import SubprocessInitializer, setup_logging
    from kgtk.graph_embeddings.export_to_tsv import make_tsv
    # from torchbiggraph.converters.export_to_tsv import make_tsv

    try:
        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

        # Store the details in the log file; the console then only shows
        # where to look.
        if kwargs['log_file_path'] is not None:
            log_file_path = kwargs['log_file_path']
            logging.basicConfig(
                format='%(asctime)s - %(filename)s[line:%(lineno)d] '
                       '- %(levelname)s: %(message)s',
                level=logging.DEBUG,
                filename=str(log_file_path),
                filemode='w')
            print(f'In Processing, Please go to {kwargs["log_file_path"]} to check details',
                  file=sys.stderr, flush=True)

        # Coerce to Path so the "/" operator below works for str values too.
        tmp_folder: Path = Path(kwargs['temporary_directory'])
        tmp_tsv_path: Path = tmp_folder / f'tmp_{input_kgtk_file.name}'
        # tmp_tsv_path:Path = input_kgtk_file.parent/f'tmp_{input_kgtk_file.name}'

        # Make sure the temporary folder exists; exist_ok avoids the race
        # between checking for the folder and creating it.
        os.makedirs(tmp_folder, exist_ok=True)

        try:
            # Remove a previous output file, if there is one.
            output_kgtk_file.unlink()
        except OSError:
            pass  # Best effort: nothing to delete, or it cannot be deleted.

        # *********************************************
        # 0. PREPARE PBG TSV FILE
        # *********************************************
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)
        error_file: typing.TextIO = sys.stdout if kwargs.get("errors_to_stdout") else sys.stderr
        kct: KgtkCreateTmpTsv = KgtkCreateTmpTsv(
            input_file_path=input_kgtk_file,
            output_file_path=tmp_tsv_path,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        # Prepare the graph file: create a temporary TSV file for PBG embedding.
        logging.info('Generate the valid tsv format for embedding ...')
        kct.process()
        logging.info('Embedding file is ready...')

        # *********************************************
        # 1. DEFINE CONFIG
        # *********************************************
        raw_config = get_config(**kwargs)

        # Set the learning rate and loss function for the chosen algorithm.
        processed_config = config_preprocess(raw_config)

        # Temporary output folder.
        tmp_output_folder = Path(processed_config['entity_path'])

        # Remove any leftover temporary output folder from an earlier run.
        shutil.rmtree(tmp_output_folder, ignore_errors=True)

        # **************************************************
        # 2. TRANSFORM GRAPH TO A BIGGRAPH-FRIENDLY FORMAT
        # **************************************************
        setup_logging()
        config = parse_config(processed_config)
        subprocess_init = SubprocessInitializer()
        input_edge_paths = [tmp_tsv_path]

        convert_input_data(
            config.entities,
            config.relations,
            config.entity_path,
            config.edge_paths,
            input_edge_paths,
            TSVEdgelistReader(lhs_col=0, rel_col=1, rhs_col=2),
            dynamic_relations=config.dynamic_relations,
        )

        # ************************************************
        # 3. TRAIN THE EMBEDDINGS
        # ************************************************
        train(config, subprocess_init=subprocess_init)

        # ************************************************
        # 4. GENERATE THE OUTPUT
        # ************************************************
        # entities_output = output_kgtk_file
        entities_output = tmp_output_folder / 'entities_output.tsv'
        relation_types_output = tmp_output_folder / 'relation_types_tf.tsv'

        # "xt" fails if either file already exists.
        with open(entities_output, "xt") as entities_tf, \
                open(relation_types_output, "xt") as relation_types_tf:
            make_tsv(config, entities_tf, relation_types_tf)

        # Write the embeddings in the requested output format.
        if kwargs['output_format'] == 'glove':  # glove format output
            shutil.copyfile(entities_output, output_kgtk_file)
        elif kwargs['output_format'] == 'w2v':  # w2v format output
            generate_w2v_output(entities_output, output_kgtk_file, kwargs)
        else:  # write to the kgtk output format tsv
            generate_kgtk_output(entities_output,
                                 output_kgtk_file,
                                 kwargs.get('output_no_header', False),
                                 verbose,
                                 very_verbose)

        logging.info(f'Embeddings has been generated in {output_kgtk_file}.')

        # ************************************************
        # 5. Garbage collection
        # ************************************************
        # Kept as an equality test so that a None value still retains data.
        if kwargs['retain_temporary_data'] == False:  # noqa: E712
            shutil.rmtree(tmp_folder)
            # tmp_tsv_path.unlink() # delete temporary tsv file
            # shutil.rmtree(tmp_output_folder) # delete temporary output folder

        if kwargs["log_file_path"] is not None:
            print('Processed Finished.', file=sys.stderr, flush=True)
            logging.info(f"Process Finished.\nOutput has been saved in {repr(str(output_kgtk_file))}")
        else:
            print(f"Process Finished.\nOutput has been saved in {repr(str(output_kgtk_file))}",
                  file=sys.stderr, flush=True)

    except Exception as e:
        raise KGTKException(str(e)) from e
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    undirected: bool,
    compute_degrees: bool,
    compute_pagerank: bool,
    compute_hits: bool,
    log_file: str,
    statistics_only: bool,
    vertex_in_degree: str,
    vertex_out_degree: str,
    vertex_pagerank: str,
    vertex_auth: str,
    vertex_hubs: str,
    top_n: int,
    errors_to_stdout: bool,
    errors_to_stderr: bool,
    show_options: bool,
    verbose: bool,
    very_verbose: bool,
    **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Compute graph statistics for a KGTK edge file.

    The graph is loaded with graph_tool, a text summary is written to
    `log_file`, and per-vertex statistics (plus, unless `statistics_only`,
    the original edges) are written as KGTK edges to the output file.

    Args:
        input_file: The KGTK input file.
        output_file: The KGTK output file.
        undirected: Load the graph as undirected.
        compute_degrees: Write degree statistics to the summary.
        compute_pagerank: Compute PageRank and include it in the output.
        compute_hits: Compute HITS hubs/authorities and include them.
        log_file: Path of the text summary file.
        statistics_only: Do not copy the input edges to the output.
        vertex_in_degree, vertex_out_degree, vertex_pagerank, vertex_auth,
        vertex_hubs: Labels used for the corresponding output edges.
        top_n: Number of top vertices listed in the summary.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not used by this function.
        show_options: Not used by this function.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Options for KgtkReaderOptions and KgtkValueOptions.

    Raises:
        KGTKException: Wraps any exception raised while processing.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from graph_tool import centrality
    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Maps the graph's vertex property names to the output edge labels.
    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:
        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

        # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
        directions = ['in', 'out', 'total']
        id_col = 'name'
        output_columns = ["node1", "label", "node2", "id"]

        if verbose:
            print('loading the KGTK input file...\n', file=error_file, flush=True)

        # Track the reader and writer so both are closed on every exit path.
        kr: typing.Optional[KgtkReader] = None
        kw: typing.Optional[KgtkWriter] = None
        try:
            kr = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            sub: int = kr.get_node1_column_index()
            if sub < 0:
                print("Missing node1 (subject) column.", file=error_file, flush=True)
            pred: int = kr.get_label_column_index()
            if pred < 0:
                print("Missing label (predicate) column.", file=error_file, flush=True)
            obj: int = kr.get_node2_column_index()
            if obj < 0:
                print("Missing node2 (object) column", file=error_file, flush=True)
            if sub < 0 or pred < 0 or obj < 0:
                raise KGTKException("Exiting due to missing columns.")

            predicate: str = kr.column_names[pred]

            G2 = load_graph_from_kgtk(kr, directed=not undirected, ecols=(sub, obj),
                                      verbose=verbose, out=error_file)
            if verbose:
                print('graph loaded! It has %d nodes and %d edges.' % (G2.num_vertices(), G2.num_edges()),
                      file=error_file, flush=True)

            kw = KgtkWriter.open(output_columns,
                                 output_kgtk_file,
                                 mode=KgtkWriter.Mode.EDGE,
                                 require_all_columns=True,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=False,
                                 verbose=verbose,
                                 very_verbose=very_verbose)

            with open(log_file, 'w') as writer:
                writer.write('graph loaded! It has %d nodes and %d edges\n' % (G2.num_vertices(), G2.num_edges()))
                writer.write('\n###Top relations:\n')
                for rel, freq in gtanalysis.get_topN_relations(G2, pred_property=predicate):
                    writer.write('%s\t%d\n' % (rel, freq))

                if compute_degrees:
                    writer.write('\n###Degrees:\n')
                    for direction in directions:
                        degree_data = gtanalysis.compute_node_degree_hist(G2, direction)
                        max_degree = len(degree_data) - 1
                        mean_degree, std_degree = gtanalysis.compute_avg_node_degree(G2, direction)
                        writer.write('%s degree stats: mean=%f, std=%f, max=%d\n' %
                                     (direction, mean_degree, std_degree, max_degree))

                if compute_pagerank:
                    writer.write('\n###PageRank\n')
                    v_pr = G2.new_vertex_property('float')
                    centrality.pagerank(G2, prop=v_pr)
                    G2.properties[('v', 'vertex_pagerank')] = v_pr

                    writer.write('Max pageranks\n')
                    result = gtanalysis.get_topn_indices(G2, 'vertex_pagerank', top_n, id_col)
                    for n_id, n_label, pr in result:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

                if compute_hits:
                    writer.write('\n###HITS\n')
                    # The first returned value is not used here.
                    _hits_eig, G2.vp['vertex_hubs'], G2.vp['vertex_auth'] = gtanalysis.compute_hits(G2)
                    writer.write('HITS hubs\n')
                    main_hubs = gtanalysis.get_topn_indices(G2, 'vertex_hubs', top_n, id_col)
                    for n_id, n_label, hubness in main_hubs:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, hubness))
                    writer.write('HITS auth\n')
                    main_auth = gtanalysis.get_topn_indices(G2, 'vertex_auth', top_n, id_col)
                    for n_id, n_label, authority in main_auth:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority))

            # Copy the original edges unless only statistics were requested.
            id_count = 0
            if not statistics_only:
                for e in G2.edges():
                    sid, oid = e
                    lbl = G2.ep[predicate][e]
                    kw.write([G2.vp[id_col][sid], lbl, G2.vp[id_col][oid],
                              '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count)])
                    id_count += 1

            # Emit one edge per vertex statistic.
            id_count = 0
            for v in G2.vertices():
                v_id = G2.vp[id_col][v]
                kw.write([v_id, vertex_in_degree, str(v.in_degree()),
                          '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)])
                id_count += 1
                kw.write([v_id, vertex_out_degree, str(v.out_degree()),
                          '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)])
                id_count += 1

                for vprop in G2.vertex_properties.keys():
                    if vprop == id_col:
                        continue
                    kw.write([v_id, v_prop_dict[vprop], str(G2.vp[vprop][v]),
                              '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)])
                    id_count += 1
        finally:
            # Release the files even when the analysis or writing failed.
            if kw is not None:
                kw.close()
            if kr is not None:
                kr.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e))
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    columns: typing.Optional[typing.List[str]],
    split_on_commas: bool,
    split_on_spaces: bool,
    strip_spaces: bool,
    all_except: bool,
    ignore_missing_columns: bool,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file, removing (or retaining only) the named columns.

    Args:
        input_file: The KGTK input file.
        output_file: The KGTK output file.
        columns: The column names to remove (or to retain, see all_except).
        split_on_commas: Treat each argument as a comma-separated list.
        split_on_spaces: Treat the arguments as a space-separated list.
        strip_spaces: Strip surrounding whitespace from each column name.
        all_except: Retain the named columns and remove all the others.
        ignore_missing_columns: Do not report names absent from the input.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not used by this function.
        show_options: Print the option values before processing.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Options for KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success, 1 after an exception was passed to
        kgtk_exception_auto_handler.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        print("--split-on-commas=%s" % str(split_on_commas), file=error_file)
        print("--split-on-spaces=%s" % str(split_on_spaces), file=error_file)
        print("--strip-spaces=%s" % str(strip_spaces), file=error_file)
        print("--all-except=%s" % str(all_except), file=error_file)
        print("--ignore-missing-columns=%s" % str(ignore_missing_columns), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        if columns is None:
            columns = []  # This simplifies matters.

        if split_on_spaces:
            # We will be very lenient, and allow space-separated arguments
            # *inside* shell quoting, e.g.
            #
            # kgtk remove_columns -c 'name name2 name3'
            #
            # Do not enable this option if spaces are legal inside your
            # column names.
            columns = " ".join(columns).split()

        remove_columns: typing.List[str] = []
        arg: str
        column_name: str
        for arg in columns:
            if split_on_commas:
                for column_name in arg.split(","):
                    if strip_spaces:
                        column_name = column_name.strip()
                    if len(column_name) > 0:
                        remove_columns.append(column_name)
            else:
                if strip_spaces:
                    arg = arg.strip()
                if len(arg) > 0:
                    remove_columns.append(arg)
        if verbose:
            if all_except:
                print("Removing all columns except %d columns: %s" % (len(remove_columns), " ".join(remove_columns)),
                      file=error_file, flush=True)
            else:
                print("Removing %d columns: %s" % (len(remove_columns), " ".join(remove_columns)),
                      file=error_file, flush=True)
        if len(remove_columns) == 0:
            raise KGTKException("No columns to remove")

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file), file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        try:
            output_column_names: typing.List[str]
            trouble_column_names: typing.List[str] = []
            if all_except:
                if not ignore_missing_columns:
                    known_columns = set(kr.column_names)
                    for column_name in remove_columns:
                        if column_name not in known_columns:
                            print("Error: cannot retain unknown column '%s'." % column_name,
                                  file=error_file, flush=True)
                            trouble_column_names.append(column_name)

                # Keep the retained columns in their input file order.
                retained_columns = set(remove_columns)
                output_column_names = [name for name in kr.column_names if name in retained_columns]
            else:
                output_column_names = kr.column_names.copy()
                for column_name in remove_columns:
                    if column_name in output_column_names:
                        output_column_names.remove(column_name)

                    elif not ignore_missing_columns:
                        print("Error: cannot remove unknown column '%s'." % column_name,
                              file=error_file, flush=True)
                        trouble_column_names.append(column_name)

            if len(trouble_column_names) > 0:
                raise KGTKException("Unknown columns %s" % " ".join(trouble_column_names))

            if verbose:
                print("Opening the output file: %s" % str(output_kgtk_file), file=error_file, flush=True)
            kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode[kr.mode.name],
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            try:
                # Maps the input column positions to the output positions.
                shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

                input_line_count: int = 0
                row: typing.List[str]
                for row in kr:
                    input_line_count += 1
                    kw.write(row, shuffle_list=shuffle_list)

                if verbose:
                    print("Processed %d rows." % (input_line_count), file=error_file, flush=True)
            finally:
                # Close the writer even when copying fails part way through.
                kw.close()
        finally:
            # The reader is released on every exit path.
            kr.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    reject_file: KGTKFiles,
    namespace_file: KGTKFiles,
    updated_namespace_file: KGTKFiles,
    namespace_id_prefix: str,
    namespace_id_use_uuid: bool,
    namespace_id_counter: int,
    namespace_id_zfill: int,
    output_only_used_namespaces: bool,
    allow_lax_uri: bool,
    local_namespace_prefix: str,
    local_namespace_use_uuid: bool,
    prefix_expansion_label: str,
    structured_value_label: str,
    structured_uri_label: str,
    newnode_prefix: str,
    newnode_use_uuid: bool,
    newnode_counter: int,
    newnode_zfill: int,
    build_id: bool,
    escape_pipes: bool,
    validate: bool,
    override_uuid: typing.Optional[str],
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Build a KgtkNtriples object from the parsed arguments and run it.

    The file arguments are resolved to paths, the ID builder, reader, and
    value option structures are built from ``kwargs``, the settings are
    optionally echoed to the error stream, and ``process()`` is called on a
    KgtkNtriples object constructed from all of them.

    ``errors_to_stderr`` is accepted but not consulted here: the error stream
    is stdout when ``errors_to_stdout`` is true and stderr otherwise.

    Returns:
        0 once ``process()`` has returned.

    Raises:
        KGTKException: when processing raises SystemExit ("Exit requested")
            or any other exception (carrying that exception's message).
    """
    # import modules locally
    from pathlib import Path
    import sys
    from kgtk.exceptions import KGTKException
    from kgtk.imports.kgtkntriples import KgtkNtriples
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the file arguments.  The reject, namespace, and updated
    # namespace paths may each be None.
    source_paths: typing.List[Path] = KGTKArgumentParser.get_input_file_list(input_file)
    output_path: Path = KGTKArgumentParser.get_output_file(output_file)
    reject_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reject_file, who="KGTK reject file")
    namespace_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_input_file(
        namespace_file, who="KGTK namespace file")
    updated_namespace_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        updated_namespace_file, who="KGTK updated namespace file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:

        def show(text: str) -> None:
            # Every option line is flushed to the error stream immediately.
            print(text, file=error_file, flush=True)

        show("--input-files %s" % " ".join(map(str, source_paths)))
        show("--output-file=%s" % str(output_path))
        if reject_path is not None:
            show("--reject-file=%s" % str(reject_path))
        if namespace_path is not None:
            show("--namespace-file=%s" % str(namespace_path))
        if updated_namespace_path is not None:
            show("--updated-namespace-file=%s" % str(updated_namespace_path))

        show("--namespace-id-prefix %s" % namespace_id_prefix)
        show("--namespace-id-use-uuid %s" % str(namespace_id_use_uuid))
        show("--namespace-id-counter %s" % str(namespace_id_counter))
        show("--namespace-id-zfill %s" % str(namespace_id_zfill))
        show("--output-only-used-namespaces %s" % str(output_only_used_namespaces))

        show("--allow-lax-uri %s" % str(allow_lax_uri))

        show("--local-namespace-prefix %s" % local_namespace_prefix)
        show("--local-namespace-use-uuid %s" % str(local_namespace_use_uuid))

        show("--prefix-expansion-label %s" % prefix_expansion_label)
        show("--structured-value-label %s" % structured_value_label)
        show("--structured-uri-label %s" % structured_uri_label)

        show("--newnode-prefix %s" % newnode_prefix)
        show("--newnode-use-uuid %s" % str(newnode_use_uuid))
        show("--newnode-counter %s" % str(newnode_counter))
        show("--newnode-zfill %s" % str(newnode_zfill))

        show("--build-id=%s" % str(build_id))
        show("--escape-pipes=%s" % str(escape_pipes))
        show("--validate=%s" % str(validate))
        show("--override-uuid=%s" % str(override_uuid))

        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        show("=======")

    try:
        KgtkNtriples(
            # Files
            input_file_paths=source_paths,
            output_file_path=output_path,
            reject_file_path=reject_path,
            namespace_file_path=namespace_path,
            updated_namespace_file_path=updated_namespace_path,
            # Namespace ID generation
            namespace_id_prefix=namespace_id_prefix,
            namespace_id_use_uuid=namespace_id_use_uuid,
            namespace_id_counter=namespace_id_counter,
            namespace_id_zfill=namespace_id_zfill,
            output_only_used_namespaces=output_only_used_namespaces,
            # New node generation
            newnode_prefix=newnode_prefix,
            newnode_use_uuid=newnode_use_uuid,
            newnode_counter=newnode_counter,
            newnode_zfill=newnode_zfill,
            # URI handling, local namespace, and labels
            allow_lax_uri=allow_lax_uri,
            local_namespace_prefix=local_namespace_prefix,
            local_namespace_use_uuid=local_namespace_use_uuid,
            prefix_expansion_label=prefix_expansion_label,
            structured_value_label=structured_value_label,
            structured_uri_label=structured_uri_label,
            # Remaining switches
            build_id=build_id,
            escape_pipes=escape_pipes,
            validate=validate,
            override_uuid=override_uuid,
            # Option structures and feedback
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    list_output_file: KGTKFiles,
    key_column_names: typing.List[str],
    keep_first_names: typing.List[str],
    compact_id: bool,
    deduplicate: bool,
    sorted_input: bool,
    verify_sort: bool,
    lists_in_input: bool,
    report_lists: bool,
    exclude_lists: bool,
    output_only_lists: bool,
    build_id: bool,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Build a KgtkCompact object from the parsed arguments and run it.

    Resolves the input, output, and optional list output paths, builds the ID
    builder, reader, and value option structures from ``kwargs``, optionally
    echoes the settings to the error stream, and then calls ``process()`` on
    a KgtkCompact object.

    Returns:
        0 once ``process()`` has returned.

    Raises:
        KGTKException: when both ``exclude_lists`` and ``output_only_lists``
            are set, or when processing raises SystemExit or any other
            exception.
    """
    # import modules locally
    from pathlib import Path
    import sys
    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkcompact import KgtkCompact
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)
    list_target_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        list_output_file, who="KGTK list output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:

        def show(text: str, flush: bool = False) -> None:
            # Only some of the option lines are flushed as they are written.
            print(text, file=error_file, flush=flush)

        show("--input-file=%s" % str(source_path))
        show("--output-file=%s" % str(target_path))
        if list_target_path is not None:
            show("--list-output-file=%s" % str(list_target_path), flush=True)
        show("--columns=%s" % " ".join(key_column_names))
        show("--keep-first=%s" % " ".join(keep_first_names))
        show("--compact-id=%s" % str(compact_id), flush=True)
        show("--deduplicate=%s" % str(deduplicate), flush=True)
        show("--presorted=%s" % str(sorted_input), flush=True)
        show("--verify-sort=%s" % str(verify_sort), flush=True)
        show("--lists-in-input=%s" % str(lists_in_input), flush=True)
        show("--report-lists=%s" % str(report_lists), flush=True)
        show("--exclude-lists=%s" % str(exclude_lists), flush=True)
        show("--output-only-lists=%s" % str(output_only_lists), flush=True)
        show("--build-id=%s" % str(build_id), flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        KgtkReader.show_debug_arguments(
            errors_to_stdout=errors_to_stdout,
            errors_to_stderr=errors_to_stderr,
            show_options=show_options,
            verbose=verbose,
            very_verbose=very_verbose,
            out=error_file,
        )
        show("=======", flush=True)

    # The two list filters contradict each other, so refuse the combination.
    if output_only_lists and exclude_lists:
        raise KGTKException("--exclude-lists and --output-only-lists may not be used together.")

    try:
        KgtkCompact(
            # Files
            input_file_path=source_path,
            output_file_path=target_path,
            list_output_file_path=list_target_path,
            # Column selection
            key_column_names=key_column_names,
            keep_first_names=keep_first_names,
            # Behaviour switches
            compact_id=compact_id,
            deduplicate=deduplicate,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            lists_in_input=lists_in_input,
            report_lists=report_lists,
            exclude_lists=exclude_lists,
            output_only_lists=output_only_lists,
            build_id=build_id,
            # Option structures and feedback
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(
    input_file: KGTKFiles,
    path_file: KGTKFiles,
    output_file: KGTKFiles,
    statistics_only: bool,
    undirected: bool,
    max_hops: int,
    source_column_name: typing.Optional[str],
    target_column_name: typing.Optional[str],
    shortest_path: bool,
    errors_to_stdout: bool,
    errors_to_stderr: bool,
    show_options: bool,
    verbose: bool,
    very_verbose: bool,
    **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Find paths between the node pairs listed in a path file.

    The path file supplies (source, target) pairs.  The input file is loaded
    as a graph with ``load_graph_from_kgtk`` and, for each pair whose source
    and target each match exactly one vertex, every path found is written to
    the output file, one row per edge on the path:

        node1 = "p<path number>", label = position of the edge in the path,
        node2 = the edge's id value, id = "<node1>-<position>-<counter>"

    Unless ``statistics_only`` is set, the graph's edges are written to the
    output first.

    Args:
        input_file: The KGTK edge file to load as a graph.  It must have
            node1, label, node2, and id columns.
        path_file: The KGTK file holding the source/target pairs.
        output_file: The KGTK file to write.
        statistics_only: When true, do not copy the graph's edges to the
            output.
        undirected: When true, load the graph as undirected.
        max_hops: Passed as ``cutoff`` to ``all_paths``; not used when
            ``shortest_path`` is true.
        source_column_name: Passed to ``get_node1_column_index`` on the path
            file to locate the source column.
        target_column_name: Passed to ``get_node2_column_index`` on the path
            file to locate the target column.
        shortest_path: When true, use ``all_shortest_paths`` instead of
            ``all_paths``.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted but not consulted here.
        show_options: Accepted but not consulted here.
        verbose: Print progress messages to the error stream.
        very_verbose: Passed to the readers and the writer.
        **kwargs: Source for the reader and value option structures.

    Raises:
        KGTKException: for any failure; the message is prefixed "Error: ".
    """
    # import modules locally
    from pathlib import Path
    import sys
    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions
    from kgtk.exceptions import KGTKException

    try:
        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

        # Name of the vertex property used below to read and match node names.
        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file), file=error_file, flush=True)
        pairs = []
        pkr: KgtkReader = KgtkReader.open(
            path_kgtk_file,
            error_file=error_file,
            options=path_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        path_source_idx: int = pkr.get_node1_column_index(source_column_name)
        if path_source_idx < 0:
            print("Missing node1 (source) column name in the path file.",
                  file=error_file, flush=True)

        path_target_idx: int = pkr.get_node2_column_index(target_column_name)
        if path_target_idx < 0:
            # The target is the node2 column (the message used to say node1).
            print("Missing node2 (target) column name in the path file.",
                  file=error_file, flush=True)

        # Report every missing column before giving up.
        if path_source_idx < 0 or path_target_idx < 0:
            pkr.close()
            raise KGTKException("Exiting due to missing columns.")

        paths_read: int = 0
        path_row: typing.List[str]
        for path_row in pkr:
            paths_read += 1
            if len(path_row) != pkr.column_count:
                raise KGTKException(
                    "Exiting because line %d in the path file (%s) is the wrong length: %d columns expected, %d were read."
                    % (paths_read, str(path_kgtk_file), pkr.column_count, len(path_row)))
            src: str = path_row[path_source_idx]
            tgt: str = path_row[path_target_idx]
            pairs.append((src, tgt))
        pkr.close()
        if verbose:
            print("%d path rows read" % paths_read, file=error_file, flush=True)
        if len(pairs) == 0:
            print("No path pairs found, the output will be empty.", file=error_file, flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs), file=error_file, flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file), file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=input_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sub_index: int = kr.get_node1_column_index()
        if sub_index < 0:
            print("Missing node1 (subject) column.", file=error_file, flush=True)
        pred_index: int = kr.get_label_column_index()
        if pred_index < 0:
            print("Missing label (predicate) column.", file=error_file, flush=True)
        obj_index: int = kr.get_node2_column_index()
        if obj_index < 0:
            print("Missing node2 (object) column", file=error_file, flush=True)
        id_index: int = kr.get_id_column_index()
        if id_index < 0:
            print("Missing id column", file=error_file, flush=True)
        if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        # The label and id edge properties are looked up under the names the
        # columns actually have in the input file.
        predicate: str = kr.column_names[pred_index]
        id_col_name: str = kr.column_names[id_index]

        G = load_graph_from_kgtk(kr,
                                 directed=not undirected,
                                 ecols=(sub_index, obj_index),
                                 verbose=verbose,
                                 out=error_file)

        output_columns: typing.List[str] = ['node1', 'label', 'node2', 'id']
        kw: KgtkWriter = KgtkWriter.open(output_columns,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        # Copy the graph's own edges to the output unless suppressed.
        id_count = 0
        if not statistics_only:
            for e in G.edges():
                sid, oid = e
                lbl = G.ep[predicate][e]
                kw.write([
                    G.vp[id_col][sid],
                    lbl,
                    G.vp[id_col][oid],
                    '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count),
                ])
                id_count += 1
            if verbose:
                print("%d edges found." % id_count, file=error_file, flush=True)

        # Emit one row per edge of every path found for every usable pair.
        id_count = 0
        path_id = 0
        for pair in pairs:
            source_node, target_node = pair
            source_ids = find_vertex(G, prop=G.properties[('v', id_col)], match=source_node)
            target_ids = find_vertex(G, prop=G.properties[('v', id_col)], match=target_node)
            # Pairs whose endpoints do not each match exactly one vertex are
            # skipped without a message.
            if len(source_ids) == 1 and len(target_ids) == 1:
                source_id = source_ids[0]
                target_id = target_ids[0]
                if shortest_path:
                    _all_paths = all_shortest_paths(G, source_id, target_id, edges=True)
                else:
                    _all_paths = all_paths(G, source_id, target_id, cutoff=max_hops, edges=True)

                for path in _all_paths:
                    for edge_num, an_edge in enumerate(path):
                        # Use the id column's real name rather than a
                        # hard-coded 'id', matching how the label property is
                        # looked up above.
                        edge_id = G.properties[('e', id_col_name)][an_edge]
                        node1: str = 'p%d' % path_id
                        kw.write([
                            node1,
                            str(edge_num),
                            edge_id,
                            '{}-{}-{}'.format(node1, edge_num, id_count),
                        ])
                        id_count += 1
                    path_id += 1

        if verbose:
            print("%d paths contining %d edges found." % (path_id, id_count),
                  file=error_file, flush=True)

        kw.close()
        kr.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e))
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    columns: typing.Optional[typing.List[str]] = None,
    labels: typing.Optional[typing.List[str]] = None,
    id_column_name: typing.Optional[str] = None,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Turn the non-ID columns of each input row into (node1, label, node2) edges.

    For every input row, node1 is the value of the ID column.  Each other
    column (restricted to ``columns`` when that is given) contributes one
    output edge per non-empty item returned by ``KgtkValue.split_list`` for
    its value.  The edge label is the column name, or the entry of ``labels``
    at the same position as the column in ``columns``.

    Returns:
        0 on success, or 1 after ``kgtk_exception_auto_handler`` has been
        given an exception raised during reading or writing.

    Raises:
        KGTKException: when ``labels`` is non-empty and ``columns`` is missing
            or has a different length.
    """
    # import modules locally
    import os
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(source_path), file=error_file)
        print("--output-file=%s" % str(target_path), file=error_file)
        for template, names in (("--columns=%s", columns), ("--labels=%s", labels)):
            if names is not None:
                print(template % " ".join(names), file=error_file)
        if id_column_name is not None:
            print("--id-column=%s" % id_column_name, file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Starting normalize_nodes pid=%d" % (os.getpid()), file=error_file, flush=True)

    # Map column name -> replacement label; unmapped columns keep their name.
    label_map: typing.MutableMapping[str, str] = dict()
    if labels is not None and len(labels) > 0:
        if columns is None:
            raise KGTKException("--columns must be supplied when --labels is used.")
        if len(columns) != len(labels):
            raise KGTKException("%d columns were supplied, but %d labels." % (len(columns), len(labels)))
        # The lengths match, so pairing by position covers every label.
        label_map.update(zip(columns, labels))

    try:
        if verbose:
            print("Opening the input file: %s" % str(source_path), file=error_file, flush=True)
        reader: KgtkReader = KgtkReader.open(
            source_path,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        id_idx: int = reader.get_id_column_index(id_column_name)
        if id_idx < 0:
            raise KGTKException("Unknown ID column %s" % repr(id_column_name))

        edge_columns: typing.List[str] = [KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2]

        if verbose:
            print("Opening the output file: %s" % str(target_path), file=error_file, flush=True)
        writer: KgtkWriter = KgtkWriter.open(edge_columns,
                                             target_path,
                                             mode=KgtkWriter.Mode.EDGE,
                                             verbose=verbose,
                                             very_verbose=very_verbose)

        rows_read: int = 0
        edges_written: int = 0
        record: typing.List[str]
        for record in reader:
            rows_read += 1
            subject: str = record[id_idx]

            position: int
            name: str
            for position, name in enumerate(reader.column_names):
                # The ID column supplies node1; it never becomes an edge.
                if position == id_idx:
                    continue
                if columns is not None and name not in columns:
                    continue

                edge_label: str = label_map.get(name, name)
                cell: str = record[position]
                if len(cell) > 0:  # ignore empty values.
                    # The column value might contain a KGTK list.  Since node2 isn't
                    # supposed to contain lists, we'll split it.
                    item: str
                    for item in KgtkValue.split_list(cell):
                        if len(item) > 0:  # node2 shouldn't contain empty values
                            writer.write([subject, edge_label, item])
                            edges_written += 1

        if verbose:
            print("Read %d node rows, wrote %d edge rows." % (rows_read, edges_written),
                  file=error_file, flush=True)

        writer.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    reified_file: KGTKFiles,
    unreified_file: KGTKFiles,
    uninvolved_file: KGTKFiles,
    trigger_label_value: str,
    trigger_node2_value: str,
    value_label_value: str,
    old_label_value: str,
    new_label_value: typing.Optional[str],
    allow_multiple_values: bool,
    allow_extra_columns: bool,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Build a KgtkUnreifyValues object from the parsed arguments and run it.

    Resolves the input and output paths plus the optional reified, unreified,
    and uninvolved output paths, builds the reader and value option
    structures from ``kwargs``, optionally echoes the settings to the error
    stream, and then calls ``process()`` on a KgtkUnreifyValues object.

    Returns:
        0 once ``process()`` has returned.

    Raises:
        KGTKException: when processing raises SystemExit ("Exit requested")
            or any other exception (carrying that exception's message).
    """
    # import modules locally
    from pathlib import Path
    import sys
    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.unreify.kgtkunreifyvalues import KgtkUnreifyValues
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)
    # Each of the three side outputs may be None.
    reified_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        reified_file, who="KGTK reified file")
    unreified_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        unreified_file, who="KGTK unreified file")
    uninvolved_path: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(
        uninvolved_file, who="KGTK uninvolved file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:

        def show(text: str) -> None:
            # Every option line is flushed to the error stream immediately.
            print(text, file=error_file, flush=True)

        show("--input-files %s" % str(source_path))
        show("--output-file=%s" % str(target_path))
        if reified_path is not None:
            show("--reified-file=%s" % str(reified_path))
        if unreified_path is not None:
            show("--unreified-file=%s" % str(unreified_path))
        if uninvolved_path is not None:
            show("--uninvolved-file=%s" % str(uninvolved_path))

        show("--trigger-label=%s" % trigger_label_value)
        show("--trigger-node2=%s" % trigger_node2_value)
        show("--value-label=%s" % value_label_value)
        show("--old-label=%s" % old_label_value)
        if new_label_value is not None:
            show("--new-label=%s" % new_label_value)

        show("--allow-multiple-values=%s" % str(allow_multiple_values))
        show("--allow-extra-columns=%s" % str(allow_extra_columns))

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        show("=======")

    try:
        KgtkUnreifyValues(
            # Files
            input_file_path=source_path,
            output_file_path=target_path,
            reified_file_path=reified_path,
            unreified_file_path=unreified_path,
            uninvolved_file_path=uninvolved_path,
            # Trigger, value, and label selection
            trigger_label_value=trigger_label_value,
            trigger_node2_value=trigger_node2_value,
            value_label_value=value_label_value,
            old_label_value=old_label_value,
            new_label_value=new_label_value,
            # Tolerance switches
            allow_multiple_values=allow_multiple_values,
            allow_extra_columns=allow_extra_columns,
            # Option structures and feedback
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(input_files: KGTKFiles,
        output_file: KGTKFiles,
        output_format: typing.Optional[str],

        output_column_names: typing.Optional[typing.List[str]],
        old_column_names: typing.Optional[typing.List[str]],
        new_column_names: typing.Optional[typing.List[str]],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Build a KgtkCat object from the parsed arguments and run it.

    Resolves the input file list and the output path, builds the reader and
    value option structures from ``kwargs``, optionally echoes the settings
    to the error stream, checks that the column renaming options are
    consistent, and then calls ``process()`` on a KgtkCat object.

    Args:
        input_files: The input files, resolved with
            ``KGTKArgumentParser.get_input_file_list``.
        output_file: The output file, resolved with
            ``KGTKArgumentParser.get_output_file``.
        output_format: Passed through to KgtkCat.
        output_column_names: Passed through to KgtkCat; may not be combined
            with ``old_column_names`` or ``new_column_names``.
        old_column_names: Passed through to KgtkCat; must be supplied
            together with ``new_column_names`` and have the same length.
        new_column_names: See ``old_column_names``.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted but not consulted here.
        show_options: Echo the settings to the error stream before running.
        verbose: Passed through to KgtkCat.
        very_verbose: Passed through to KgtkCat.
        **kwargs: Source for the reader and value option structures.

    Returns:
        0 once ``process()`` has returned.

    Raises:
        KGTKException: when the column options are inconsistent, or when
            processing raises SystemExit or any other exception.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException
    from kgtk.join.kgtkcat import KgtkCat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_file_paths: typing.List[Path] = KGTKArgumentParser.get_input_file_list(input_files)
    output_file_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # TODO: check that at most one input file is stdin?

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-files %s" % " ".join((str(input_file_path) for input_file_path in input_file_paths)),
              file=error_file, flush=True)
        print("--output-file=%s" % str(output_file_path), file=error_file, flush=True)
        if output_format is not None:
            print("--output-format=%s" % output_format, file=error_file, flush=True)
        if output_column_names is not None:
            # The option is spelled --output-columns (this line used to print
            # "--output-coloumns").
            print("--output-columns %s" % " ".join(output_column_names), file=error_file, flush=True)
        if old_column_names is not None:
            print("--old-columns %s" % " ".join(old_column_names), file=error_file, flush=True)
        if new_column_names is not None:
            print("--new-columns %s" % " ".join(new_column_names), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Check for consistent options.  argparse doesn't support this yet.
    # An option counts as "used" only when it is a non-empty list.
    have_output_columns: bool = bool(output_column_names)
    have_old_columns: bool = bool(old_column_names)
    have_new_columns: bool = bool(new_column_names)

    if have_output_columns:
        if have_old_columns or have_new_columns:
            raise KGTKException("When --output-columns is used, --old-columns and --new-columns may not be used.")
    elif have_old_columns != have_new_columns:
        raise KGTKException("Both --old-columns and --new-columns must be used when either is used.")
    elif have_old_columns and have_new_columns:
        if len(old_column_names) != len(new_column_names):
            raise KGTKException("Both --old-columns and --new-columns must have the same number of columns.")

    try:
        kc: KgtkCat = KgtkCat(input_file_paths=input_file_paths,
                              output_path=output_file_path,
                              output_format=output_format,
                              output_column_names=output_column_names,
                              old_column_names=old_column_names,
                              new_column_names=new_column_names,
                              reader_options=reader_options,
                              value_options=value_options,
                              error_file=error_file,
                              verbose=verbose,
                              very_verbose=very_verbose)

        kc.process()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(
    left_file: KGTKFiles,
    right_file: KGTKFiles,
    output_file: KGTKFiles,
    join_on_id: bool = False,
    join_on_label: bool = False,
    join_on_node2: bool = False,
    left_prefix: typing.Optional[str] = None,
    left_join_columns: typing.Optional[typing.List[str]] = None,
    left_join: bool = False,
    right_prefix: typing.Optional[str] = None,
    right_join_columns: typing.Optional[typing.List[str]] = None,
    right_join: bool = False,
    field_separator: typing.Optional[str] = None,
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Build a KgtkJoiner object from the parsed arguments and run it.

    Resolves the left, right, and output paths, rejects unsupported uses of
    stdin ("-") as an input, builds separate left and right reader option
    structures plus the value options from ``kwargs``, optionally echoes the
    settings to the error stream, and then calls ``process()`` on a
    KgtkJoiner object.

    Returns:
        0 once ``process()`` has returned, or 1 (after printing a message to
        the error stream) when stdin is used where it is not allowed.

    Raises:
        KGTKException: when processing raises SystemExit ("Exit requested")
            or any other exception (carrying that exception's message).
    """
    # import modules locally
    from pathlib import Path
    import sys
    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.join.kgtkjoiner import KgtkJoiner
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    left_path: Path = KGTKArgumentParser.get_input_file(left_file, who="KGTK left file")
    right_path: Path = KGTKArgumentParser.get_input_file(right_file, who="KGTK right file")
    joined_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Fall back to the joiner's default separator when none was supplied.
    if field_separator is None:
        field_separator = KgtkJoiner.FIELD_SEPARATOR_DEFAULT

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    # "-" denotes stdin.
    left_is_stdin: bool = str(left_path) == "-"
    right_is_stdin: bool = str(right_path) == "-"

    if left_is_stdin and not right_join:
        print("The left file may not be stdin when an inner join or left join is requested.",
              file=error_file, flush=True)
        return 1

    if right_is_stdin and not left_join:
        print("The right file may not be stdin when an inner join or right join is requested.",
              file=error_file, flush=True)
        return 1

    if left_is_stdin and right_is_stdin:
        print("The left and right files may not both be stdin.", file=error_file, flush=True)
        return 1

    # Build the option structures.
    left_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="left", fallback=True)
    right_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="right", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:

        def show(text: str) -> None:
            print(text, file=error_file)

        show("--left-file=%s" % str(left_path))
        show("--right-file=%s" % str(right_path))
        show("--output-file=%s" % str(joined_path))

        show("--left-join=%s" % str(left_join))
        show("--right-join=%s" % str(right_join))
        show("--join-on-id=%s" % str(join_on_id))
        show("--join-on-label=%s" % str(join_on_label))
        show("--join-on-node2=%s" % str(join_on_node2))
        if left_join_columns is not None:
            show("--left-join-columns=%s" % " ".join(left_join_columns))
        if right_join_columns is not None:
            show("--right-join-columns=%s" % " ".join(right_join_columns))
        if left_prefix is not None:
            show("--left-prefix=%s" % str(left_prefix))
        if right_prefix is not None:
            show("--right-prefix=%s" % str(right_prefix))
        show("--field-separator=%s" % repr(field_separator))

        left_reader_options.show(out=error_file, who="left")
        right_reader_options.show(out=error_file, who="right")
        value_options.show(out=error_file)

    try:
        KgtkJoiner(
            # Files
            left_file_path=left_path,
            right_file_path=right_path,
            output_path=joined_path,
            # Join type and join columns
            left_join=left_join,
            right_join=right_join,
            join_on_id=join_on_id,
            join_on_label=join_on_label,
            join_on_node2=join_on_node2,
            left_join_columns=left_join_columns,
            right_join_columns=right_join_columns,
            # Prefixes and separator
            left_prefix=left_prefix,
            right_prefix=right_prefix,
            field_separator=field_separator,
            # Option structures and feedback
            left_reader_options=left_reader_options,
            right_reader_options=right_reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        ).process()
        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        key_column_names: typing.List[str],
        compact_id: bool,
        sorted_input: bool,
        verify_sort: bool,
        build_id: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run the compact command: build a KgtkCompact from the options and process it.

    Args:
        input_file: parsed input file argument, resolved through KGTKArgumentParser.
        output_file: parsed output file argument, resolved through KGTKArgumentParser.
        key_column_names: key column names handed to KgtkCompact.
        compact_id: passed through to KgtkCompact.
        sorted_input: passed through to KgtkCompact (shown as --presorted).
        verify_sort: passed through to KgtkCompact.
        build_id: passed through to KgtkCompact.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for CLI symmetry; stderr is the default stream.
        show_options: dump the effective options to the diagnostic stream.
        verbose: passed through to KgtkCompact.
        very_verbose: passed through to KgtkCompact.
        **kwargs: source for the ID builder, reader and value option structures.

    Returns:
        0 on success.

    Raises:
        KGTKException: wraps any Exception or SystemExit raised while processing.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        print("--columns=%s" % " ".join(key_column_names), file=error_file)
        print("--compact-id=%s" % str(compact_id), file=error_file, flush=True)
        # This line used to omit file=error_file and therefore leaked onto
        # stdout, where it could be mixed into data written to stdout.
        print("--presorted=%s" % str(sorted_input), file=error_file, flush=True)
        print("--verify-sort=%s" % str(verify_sort), file=error_file, flush=True)
        print("--build-id=%s" % str(build_id), file=error_file, flush=True)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        ex: KgtkCompact = KgtkCompact(
            input_file_path=input_kgtk_file,
            key_column_names=key_column_names,
            compact_id=compact_id,
            sorted_input=sorted_input,
            verify_sort=verify_sort,
            output_file_path=output_kgtk_file,
            build_id=build_id,
            idbuilder_options=idbuilder_options,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        ex.process()

        return 0

    except SystemExit:
        # SystemExit is not an Exception subclass, so it needs its own clause.
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(node_file: KGTKFiles,
        edge_file: KGTKFiles,
        qualifier_file: KGTKFiles,
        output_file: KGTKFiles,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run the Wikidata export: resolve the files, build ExportWikidata, process.

    The node, edge and qualifier files are resolved with default_stdin=False.
    Returns 0 on success; any Exception or SystemExit raised while exporting
    is re-raised as a KGTKException.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.exports.exportwikidata import ExportWikidata
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Resolve the three inputs and the output, in the same order as the arguments.
    node_path: Path = KGTKArgumentParser.get_input_file(
        node_file, who="KGTK node file", default_stdin=False)
    edge_path: Path = KGTKArgumentParser.get_input_file(
        edge_file, who="KGTK edge file", default_stdin=False)
    qualifier_path: Path = KGTKArgumentParser.get_input_file(
        qualifier_file, who="KGTK qualifier file", default_stdin=False)
    output_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics go to stderr unless stdout was explicitly requested.
    error_file: typing.TextIO
    if errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Option structures are derived from the leftover keyword arguments.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Echo the effective options for debugging and documentation.
    if show_options:
        file_options = (
            ("--node-file=%s", node_path),
            ("--edge-file=%s", edge_path),
            ("--qualifier-file=%s", qualifier_path),
            ("--output-file=%s", output_path),
        )
        for template, path in file_options:
            print(template % str(path), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        exporter: ExportWikidata = ExportWikidata(
            node_file_path=node_path,
            edge_file_path=edge_path,
            qualifier_file_path=qualifier_path,
            output_file_path=output_path,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        exporter.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        mapping_file: KGTKFiles,
        unmodified_edges_file: KGTKFiles,
        activated_mapping_file: KGTKFiles,
        rejected_mapping_file: KGTKFiles,

        confidence_column_name: str,
        require_confidence: bool,
        default_confidence_str: typing.Optional[str],
        confidence_threshold: float,

        same_as_item_label: str,
        same_as_property_label: str,
        allow_exact_duplicates: bool,
        allow_idempotent_mapping: bool,

        split_output_mode: bool,
        modified_pattern: str,

        node1_column_name: typing.Optional[str],
        label_column_name: typing.Optional[str],
        node2_column_name: typing.Optional[str],

        mapping_rule_mode: str,
        mapping_node1_column_name: typing.Optional[str],
        mapping_label_column_name: typing.Optional[str],
        mapping_node2_column_name: typing.Optional[str],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Rewrite node1/label/node2 values of the input edges using a mapping file.

    The mapping file is read first. Each accepted mapping row becomes either an
    item rule (applied to the input node1 and node2 columns) or a property rule
    (applied to the input label column). The input file is then streamed and
    each row is written with the substitutions applied when it counts as
    modified according to `modified_pattern`.

    Args:
        input_file: edges to rewrite.
        output_file: receives modified rows, and also unmodified rows unless
            `split_output_mode` is set.
        mapping_file: rows of (node1 -> node2) replacement rules.
        unmodified_edges_file: optional output for rows that were not modified.
        activated_mapping_file: optional output for mapping rows that were used
            at least once, written in mapping-file line order.
        rejected_mapping_file: optional output for mapping rows excluded because
            their confidence fell below `confidence_threshold`.
        confidence_column_name: name of the mapping file confidence column.
        require_confidence: when true, a missing confidence column or an empty
            confidence value is an error.
        default_confidence_str: text form of the confidence used when a mapping
            row supplies none; must parse as a float when given.
        confidence_threshold: mapping rows with a lower confidence are excluded.
        same_as_item_label: mapping label selecting an item rule in "normal" mode.
        same_as_property_label: mapping label selecting a property rule in
            "normal" mode.
        allow_exact_duplicates: tolerate repeated identical mapping rules.
        allow_idempotent_mapping: keep mapping rows whose node1 equals node2.
        split_output_mode: do not copy unmodified rows to the main output.
        modified_pattern: which of node1/label/node2 must have changed (joined
            with "|" for any, "&" for all) for a row to count as modified.
        node1_column_name, label_column_name, node2_column_name: optional
            overrides for the input file column names.
        mapping_rule_mode: "normal", "same-as-item" or "same-as-property".
        mapping_node1_column_name, mapping_label_column_name,
        mapping_node2_column_name: optional overrides for the mapping file
            column names.
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: accepted for CLI symmetry; stderr is the default stream.
        show_options: dump the effective options to the diagnostic stream.
        verbose, very_verbose: passed to the readers and writers; `verbose`
            also enables progress messages here.
        **kwargs: source for the reader and value option structures.

    Returns:
        0 on success.

    Raises:
        KGTKException: for an invalid default confidence, missing columns,
            mapping file errors, an empty rule set, an unrecognized
            `modified_pattern`, or any other Exception/SystemExit raised while
            processing.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    mapping_kgtk_file: Path = KGTKArgumentParser.get_input_file(mapping_file, who="KGTK mappping file")
    unmodified_edges_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmodified_edges_file, who="KGTK unmodified edges output file")
    activated_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(activated_mapping_file, who="KGTK activated mapping output file")
    rejected_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(rejected_mapping_file, who="KGTK rejected mapping output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    mapping_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="mapping", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % repr(str(input_kgtk_file)), file=error_file, flush=True)
        print("--output-file=%s" % repr(str(output_kgtk_file)), file=error_file, flush=True)
        print("--mapping-file=%s" % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        if unmodified_edges_kgtk_file is not None:
            print("--unmodified-edges-file=%s" % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
        if activated_mapping_kgtk_file is not None:
            print("--activated-mapping-edges-file=%s" % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
        if rejected_mapping_kgtk_file is not None:
            print("--rejected-mapping-edges-file=%s" % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)

        print("--confidence-column=%s" % repr(confidence_column_name), file=error_file, flush=True)
        print("--require-confidence=%s" % repr(require_confidence), file=error_file, flush=True)
        if default_confidence_str is not None:
            print("--default-confidence-value=%s" % default_confidence_str, file=error_file, flush=True)
        print("--threshold=%f" % confidence_threshold, file=error_file, flush=True)

        print("--same-as-item-label=%s" % repr(same_as_item_label), file=error_file, flush=True)
        print("--same-as-property-label=%s" % repr(same_as_property_label), file=error_file, flush=True)
        print("--allow-exact-duplicates=%s" % repr(allow_exact_duplicates), file=error_file, flush=True)
        print("--allow-idempotent-actions=%s" % repr(allow_idempotent_mapping), file=error_file, flush=True)

        print("--split-output-mode=%s" % repr(split_output_mode), file=error_file, flush=True)
        print("--modified-pattern=%s" % repr(modified_pattern), file=error_file, flush=True)

        if node1_column_name is not None:
            print("--node1-column-=%s" % repr(node1_column_name), file=error_file, flush=True)
        if label_column_name is not None:
            print("--label-column-=%s" % repr(label_column_name), file=error_file, flush=True)
        if node2_column_name is not None:
            print("--node2-column-=%s" % repr(node2_column_name), file=error_file, flush=True)

        print("--mapping-rule-mode=%s" % repr(mapping_rule_mode), file=error_file, flush=True)
        if mapping_node1_column_name is not None:
            print("--mapping-node1-column-=%s" % repr(mapping_node1_column_name), file=error_file, flush=True)
        if mapping_label_column_name is not None:
            print("--mapping-label-column-=%s" % repr(mapping_label_column_name), file=error_file, flush=True)
        if mapping_node2_column_name is not None:
            print("--mapping-node2-column-=%s" % repr(mapping_node2_column_name), file=error_file, flush=True)

        input_reader_options.show(out=error_file, who="input")
        mapping_reader_options.show(out=error_file, who="mapping")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    default_confidence_value: typing.Optional[float] = None
    if default_confidence_str is not None:
        try:
            default_confidence_value = float(default_confidence_str)
        except (TypeError, ValueError):
            # Only conversion failures are reported as an invalid option; a bare
            # except here would also intercept KeyboardInterrupt and SystemExit.
            raise KGTKException("--default-confidence-value=%s is invalid" % repr(default_confidence_str))

    try:
        if verbose:
            print("Opening the mapping file %s." % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        mkr: KgtkReader = KgtkReader.open(mapping_kgtk_file,
                                          options=mapping_reader_options,
                                          value_options=value_options,
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose,
                                          )

        trouble = False
        mapping_node1_idx: int = mkr.get_node1_column_index(mapping_node1_column_name)
        mapping_label_idx: int = mkr.get_label_column_index(mapping_label_column_name)
        mapping_node2_idx: int = mkr.get_node2_column_index(mapping_node2_column_name)
        if mapping_node1_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node1 column.", file=error_file, flush=True)
        # The label column is only consulted in "normal" mode.
        if mapping_label_idx < 0 and mapping_rule_mode == "normal":
            trouble = True
            print("Error: Cannot find the mapping file label column.", file=error_file, flush=True)
        if mapping_node2_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            mkr.close()
            raise KGTKException("Missing columns in the mapping file.")
        confidence_column_idx: int = mkr.column_name_map.get(confidence_column_name, -1)
        if require_confidence and confidence_column_idx < 0:
            mkr.close()
            raise KGTKException("The mapping file does not have a confidence column, and confidence is required.")

        rmkw: typing.Optional[KgtkWriter] = None
        if rejected_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the rejected mapping edges file %s." % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)
            rmkw = KgtkWriter.open(mkr.column_names,
                                   rejected_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # Mapping structures:
        #   *_map:      old value -> replacement value
        #   *_line_map: old value -> mapping file line that defined the rule
        #   mapping_rows / activated_mapping_rows: mapping line number -> row
        item_map: typing.MutableMapping[str, str] = dict()
        item_line_map: typing.MutableMapping[str, int] = dict()
        property_map: typing.MutableMapping[str, str] = dict()
        property_line_map: typing.MutableMapping[str, int] = dict()

        mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()
        activated_mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()

        # Read the mapping file.
        if verbose:
            print("Processing the mapping file.", file=error_file, flush=True)
        mapping_confidence_exclusions: int = 0
        mapping_idempotent_exclusions: int = 0
        mapping_errors: int = 0
        mapping_line_number: int = 0
        mrow: typing.List[str]
        for mrow in mkr:
            mapping_line_number += 1
            mapping_node1: str = mrow[mapping_node1_idx]
            mapping_label: str = mrow[mapping_label_idx] if mapping_rule_mode == "normal" else ""
            mapping_node2: str = mrow[mapping_node2_idx]
            mapping_confidence: typing.Optional[float] = default_confidence_value

            if confidence_column_idx >= 0:
                confidence_value_str: str = mrow[confidence_column_idx]
                if len(confidence_value_str) == 0:
                    # An empty value falls back to the default unless required.
                    if require_confidence:
                        print("In line %d of the mapping file: the required confidence value is missing" % (mapping_line_number), file=error_file, flush=True)
                        mapping_errors += 1
                        continue
                else:
                    try:
                        mapping_confidence = float(confidence_value_str)
                    except ValueError:
                        print("In line %d of the mapping file: cannot parse confidence value %s" % (mapping_line_number, repr(mrow[confidence_column_idx])), file=error_file, flush=True)
                        mapping_errors += 1
                        continue

            if mapping_confidence is not None and mapping_confidence < confidence_threshold:
                mapping_confidence_exclusions += 1
                if rmkw is not None:
                    rmkw.write(mrow)
                continue

            if mapping_node1 == mapping_node2 and not allow_idempotent_mapping:
                mapping_idempotent_exclusions += 1
                continue

            if mapping_rule_mode == "same-as-item" or mapping_label == same_as_item_label:
                if mapping_node1 in item_map:
                    # A repeat is tolerated only when it is identical and allowed.
                    if mapping_node2 != item_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                     repr(mapping_node1),
                                                                                                     mapping_line_number,
                                                                                                     item_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue

                item_map[mapping_node1] = mapping_node2
                item_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            elif mapping_rule_mode == "same-as-property" or mapping_label == same_as_property_label:
                if mapping_node1 in property_map:
                    if mapping_node2 != property_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                     repr(mapping_node1),
                                                                                                     mapping_line_number,
                                                                                                     property_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue

                property_map[mapping_node1] = mapping_node2
                property_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            else:
                print("Unknown mapping action %s at line %d of mapping file %s" % (mapping_label,
                                                                                  mapping_line_number,
                                                                                  repr(str(mapping_kgtk_file))),
                      file=error_file, flush=True)
                mapping_errors += 1
                continue

        # Close the mapping file.
        mkr.close()
        if rmkw is not None:
            rmkw.close()

        if mapping_errors > 0:
            raise KGTKException("%d errors detected in the mapping file %s" % (mapping_errors, repr(str(mapping_kgtk_file))))

        if len(item_map) == 0 and len(property_map) == 0:
            raise KGTKException("Nothing read from the mapping file %s" % repr(str(mapping_kgtk_file)))

        if verbose:
            print("%d mapping lines, %d excluded for confidence, %d excluded for idempotency." % (mapping_line_number,
                                                                                                 mapping_confidence_exclusions,
                                                                                                 mapping_idempotent_exclusions),
                  file=error_file, flush=True)
            print("%d item mapping rules." % len(item_map), file=error_file, flush=True)
            print("%d property mapping rules." % len(property_map), file=error_file, flush=True)

        if verbose:
            print("Opening the input file %s." % repr(str(input_kgtk_file)), file=error_file, flush=True)
        ikr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                          options=input_reader_options,
                                          value_options=value_options,
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose,
                                          )

        # Only the columns that the selected rule mode will touch are required.
        trouble = False
        input_node1_idx: int = ikr.get_node1_column_index(node1_column_name)
        input_label_idx: int = ikr.get_label_column_index(label_column_name)
        input_node2_idx: int = ikr.get_node2_column_index(node2_column_name)
        if input_node1_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node1 column.", file=error_file, flush=True)
        if input_label_idx < 0 and mapping_rule_mode in ["normal", "same-as-property"]:
            trouble = True
            print("Error: Cannot find the input file label column.", file=error_file, flush=True)
        if input_node2_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            ikr.close()
            raise KGTKException("Missing columns in the input file.")

        okw: KgtkWriter = KgtkWriter.open(ikr.column_names,
                                          output_kgtk_file,
                                          mode=KgtkWriter.Mode[ikr.mode.name],
                                          use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                          mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose)

        uekw: typing.Optional[KgtkWriter] = None
        if unmodified_edges_kgtk_file is not None:
            if verbose:
                print("Opening the unmodified edges file %s." % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
            uekw = KgtkWriter.open(ikr.column_names,
                                   unmodified_edges_kgtk_file,
                                   mode=KgtkWriter.Mode[ikr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        amkw: typing.Optional[KgtkWriter] = None
        if activated_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the activated mapping edges file %s." % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
            # mkr is closed by now; only its column names and mode are read.
            amkw = KgtkWriter.open(mkr.column_names,
                                   activated_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # Process each row of the input file.
        if verbose:
            print("Processing the input file.", file=error_file, flush=True)
        input_count: int = 0
        modified_edge_count: int = 0
        unmodified_edge_count: int = 0

        row: typing.List[str]
        for row in ikr:
            input_count += 1
            newrow: typing.List[str] = row.copy()

            modified_node1: bool = False
            modified_node2: bool = False
            modified_label: bool = False

            if mapping_rule_mode in ["normal", "same-as-item"]:
                input_node1: str = row[input_node1_idx]
                if input_node1 in item_map:
                    newrow[input_node1_idx] = item_map[input_node1]
                    modified_node1 = True
                    # Activated rules are only tracked when they will be written.
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node1]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

                input_node2: str = row[input_node2_idx]
                if input_node2 in item_map:
                    newrow[input_node2_idx] = item_map[input_node2]
                    modified_node2 = True
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node2]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            if mapping_rule_mode in ["normal", "same-as-property"]:
                input_label: str = row[input_label_idx]
                if input_label in property_map:
                    newrow[input_label_idx] = property_map[input_label]
                    modified_label = True
                    if amkw is not None:
                        mapping_line_number = property_line_map[input_label]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            # "|" patterns need any listed field changed; "&" patterns need all.
            modified: bool
            if modified_pattern == "node1|label|node2":
                modified = modified_node1 or modified_label or modified_node2
            elif modified_pattern == "node1|label":
                modified = modified_node1 or modified_label
            elif modified_pattern == "node1|node2":
                modified = modified_node1 or modified_node2
            elif modified_pattern == "label|node2":
                modified = modified_label or modified_node2
            elif modified_pattern == "node1":
                modified = modified_node1
            elif modified_pattern == "label":
                modified = modified_label
            elif modified_pattern == "node2":
                modified = modified_node2
            elif modified_pattern == "node1&label&node2":
                modified = modified_node1 and modified_label and modified_node2
            elif modified_pattern == "node1&label":
                modified = modified_node1 and modified_label
            elif modified_pattern == "node1&node2":
                modified = modified_node1 and modified_node2
            elif modified_pattern == "label&node2":
                modified = modified_label and modified_node2
            else:
                raise KGTKException("Unrecognized modification test pattern %s" % repr(modified_pattern))

            if modified:
                modified_edge_count += 1
                okw.write(newrow)
            else:
                # A row that fails the pattern is emitted in its original form,
                # even if some of its fields had a replacement available.
                unmodified_edge_count += 1
                if uekw is not None:
                    uekw.write(row)
                if not split_output_mode:
                    okw.write(row)

        # Done!
        ikr.close()
        okw.close()

        if verbose:
            print("%d edges read. %d modified, %d unmodified." % (input_count, modified_edge_count, unmodified_edge_count), file=error_file, flush=True)

        if uekw is not None:
            uekw.close()

        if amkw is not None:
            activated_count: int = 0
            for mapping_line_number in sorted(activated_mapping_rows.keys()):
                amkw.write(activated_mapping_rows[mapping_line_number])
                activated_count += 1
            amkw.close()

            if verbose:
                print("%d activated mapping edges" % activated_count, file=error_file, flush=True)

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))