def cat_done(cmd, success, exit_code): # When the cat command finishes, monitor the progress of the sort command. if verbose: print("\nDone reading the input file", file=error_file, flush=True) if cmd_proc is None: return # Locate the sort command using pgrep buf = StringIO() try: sh_pgrep = sh.Command(pgrep_command) sh_pgrep("-g", cmd_proc.pgid, "--newest", sort_command, _out=buf) pgrep_output = buf.getvalue() if len(pgrep_output) == 0: if verbose: print("Unable to locate the sort command.", file=error_file, flush=True) return sort_pid = int(pgrep_output) except Exception as e: if verbose: print("Exception looking for sort command: %s" % str(e), file=error_file, flush=True) return finally: buf.close() if verbose: print("Monitoring the sort command (pid=%d)" % sort_pid, file=error_file, flush=True) progress_startup(pid=sort_pid)
def python_sort(): if numeric_columns is not None and len(numeric_columns) > 0: raise KGTKException( 'Error: the pure Python sorter does not currently support numeric column sorts.' ) if reverse_columns is not None and len(reverse_columns) > 0: raise KGTKException( 'Error: the pure Python sorter does not currently support reverse column sorts.' ) if verbose: print("Opening the input file: %s" % str(input_path), file=error_file, flush=True) kr: KgtkReader = KgtkReader.open( input_path, options=reader_options, value_options=value_options, error_file=error_file, verbose=verbose, very_verbose=very_verbose, ) sort_idx: int key_idxs: typing.List[int] = [] if columns is not None and len(columns) > 0: # Process the list of column names, including splitting # comma-separated lists of column names. column_name: str for column_name in columns: column_name_2: str for column_name_2 in column_name.split(","): column_name_2 = column_name_2.strip() if len(column_name_2) == 0: continue if column_name_2.isdigit(): sort_idx = int(column_name_2) if sort_idx > len(kr.column_names): kr.close() cleanup() raise KGTKException( "Invalid column number %d (max %d)." % (sort_idx, len(kr.column_names))) key_idxs.append(sort_idx - 1) else: if column_name_2 not in kr.column_names: kr.close() cleanup() raise KGTKException("Unknown column_name %s" % column_name_2) key_idxs.append(kr.column_name_map[column_name_2]) else: if kr.is_node_file: key_idxs.append(kr.id_column_idx) elif kr.is_edge_file: if kr.id_column_idx >= 0: key_idxs.append(kr.id_column_idx) key_idxs.append(kr.node1_column_idx) key_idxs.append(kr.label_column_idx) key_idxs.append(kr.node2_column_idx) else: cleanup() raise KGTKException( "Unknown KGTK file mode, please specify the sorting columns." ) if verbose: print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]), file=error_file, flush=True) if numeric_sort and len(key_idxs) > 1: raise KGTKException( 'Error: the pure Python sorter does not currently support numeric sorts on multiple columns.' ) lines: typing.MutableMapping[typing.Union[str, float], typing.List[typing.List[str]]] = dict() progress_startup() key: typing.Union[str, float] row: typing.List[str] for row in kr: key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx] for idx in key_idxs) if numeric_sort: key = float(key) if key in lines: # There are multiple rows with the same key. Make this a stable sort. lines[key].append(row) else: lines[key] = [row] if verbose: print("\nRead %d data lines." % len(lines), file=error_file, flush=True) kw = KgtkWriter.open(kr.column_names, output_path, mode=KgtkWriter.Mode[kr.mode.name], verbose=verbose, very_verbose=very_verbose) for key in sorted(lines.keys(), reverse=reverse_sort): for row in lines[key]: kw.write(row) kw.close() kr.close()
def run( input_file: KGTKFiles, output_file: KGTKFiles, columns: typing.Optional[typing.List[str]] = None, locale: str = "C", reverse_sort: bool = False, reverse_columns: typing.Optional[typing.List[str]] = None, numeric_sort: bool = False, numeric_columns: typing.Optional[typing.List[str]] = None, pure_python: bool = False, extra: typing.Optional[str] = None, bash_command: str = "bash", bzip2_command: str = "bzip2", gzip_command: str = "gzip", pgrep_command: str = "pgrep", sort_command: str = "sort", xz_command: str = "xz", errors_to_stdout: bool = False, errors_to_stderr: bool = True, show_options: bool = False, verbose: bool = False, very_verbose: bool = False, **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want. ) -> int: from io import StringIO import os from pathlib import Path import sh # type: ignore import sys import typing from kgtk.cli_entry import progress_startup from kgtk.kgtkformat import KgtkFormat from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.value.kgtkvalueoptions import KgtkValueOptions input_path: Path = KGTKArgumentParser.get_input_file(input_file) output_path: Path = KGTKArgumentParser.get_output_file(output_file) # Select where to send error messages, defaulting to stderr. error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr # Build the option structures. reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) def python_sort(): if numeric_columns is not None and len(numeric_columns) > 0: raise KGTKException( 'Error: the pure Python sorter does not currently support numeric column sorts.' ) if reverse_columns is not None and len(reverse_columns) > 0: raise KGTKException( 'Error: the pure Python sorter does not currently support reverse column sorts.' ) if verbose: print("Opening the input file: %s" % str(input_path), file=error_file, flush=True) kr: KgtkReader = KgtkReader.open( input_path, options=reader_options, value_options=value_options, error_file=error_file, verbose=verbose, very_verbose=very_verbose, ) sort_idx: int key_idxs: typing.List[int] = [] if columns is not None and len(columns) > 0: # Process the list of column names, including splitting # comma-separated lists of column names. column_name: str for column_name in columns: column_name_2: str for column_name_2 in column_name.split(","): column_name_2 = column_name_2.strip() if len(column_name_2) == 0: continue if column_name_2.isdigit(): sort_idx = int(column_name_2) if sort_idx > len(kr.column_names): kr.close() cleanup() raise KGTKException( "Invalid column number %d (max %d)." % (sort_idx, len(kr.column_names))) key_idxs.append(sort_idx - 1) else: if column_name_2 not in kr.column_names: kr.close() cleanup() raise KGTKException("Unknown column_name %s" % column_name_2) key_idxs.append(kr.column_name_map[column_name_2]) else: if kr.is_node_file: key_idxs.append(kr.id_column_idx) elif kr.is_edge_file: if kr.id_column_idx >= 0: key_idxs.append(kr.id_column_idx) key_idxs.append(kr.node1_column_idx) key_idxs.append(kr.label_column_idx) key_idxs.append(kr.node2_column_idx) else: cleanup() raise KGTKException( "Unknown KGTK file mode, please specify the sorting columns." ) if verbose: print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]), file=error_file, flush=True) if numeric_sort and len(key_idxs) > 1: raise KGTKException( 'Error: the pure Python sorter does not currently support numeric sorts on multiple columns.' ) lines: typing.MutableMapping[typing.Union[str, float], typing.List[typing.List[str]]] = dict() progress_startup() key: typing.Union[str, float] row: typing.List[str] for row in kr: key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx] for idx in key_idxs) if numeric_sort: key = float(key) if key in lines: # There are multiple rows with the same key. Make this a stable sort. lines[key].append(row) else: lines[key] = [row] if verbose: print("\nRead %d data lines." % len(lines), file=error_file, flush=True) kw = KgtkWriter.open(kr.column_names, output_path, mode=KgtkWriter.Mode[kr.mode.name], verbose=verbose, very_verbose=very_verbose) for key in sorted(lines.keys(), reverse=reverse_sort): for row in lines[key]: kw.write(row) kw.close() kr.close() if pure_python: return python_sort() try: global header_read_fd global header_write_fd header_read_fd, header_write_fd = os.pipe() os.set_inheritable(header_write_fd, True) if verbose: print("header pipe: read_fd=%d write_fd=%d" % (header_read_fd, header_write_fd), file=error_file, flush=True) global sortopt_read_fd global sortopt_write_fd sortopt_read_fd, sortopt_write_fd = os.pipe() os.set_inheritable(sortopt_read_fd, True) if verbose: print("sort options pipe: read_fd=%d write_fd=%d" % (sortopt_read_fd, sortopt_write_fd), file=error_file, flush=True) locale_envar: str = "LC_ALL=%s" % locale if len(locale) > 0 else "" # Note: "read -u n", used below, is not supported by some shells. # bash and zsh support it. # ash, csh, dash, and tcsh do not. # The original standard Bourne shell, sh, does not. # ksh might do it, if the FD number is a single digit. cmd: str = "".join(( "{ IFS= read -r header ; ", # Read the header line " { printf \"%s\\n\" \"$header\" >&" + str(header_write_fd) + " ; } ; ", # Send the header to Python " printf \"%s\\n\" \"$header\" ; ", # Send the header to standard output (which may be redirected to a file, below). " IFS= read -u " + str(sortopt_read_fd) + " -r options ; ", # Read the sort command options from Python. " %s %s -t '\t' $options ; } " % ( locale_envar, sort_command ), # Sort the remaining input lines using the options read from Python. )) if str(output_path) != "-": # Do we want to compress the output? output_suffix: str = output_path.suffix.lower() if output_suffix in [".gz", ".z"]: if verbose: print("gzip output file: %s" % repr(str(output_path)), file=error_file, flush=True) cmd += " | " + gzip_command + " -" elif output_suffix in [".bz2", ".bz"]: if verbose: print("bzip2 output file: %s" % repr(str(output_path)), file=error_file, flush=True) cmd += " | " + bzip2_command + " -z" elif output_suffix in [".xz", ".lzma"]: if verbose: print("xz output file: %s" % repr(str(output_path)), file=error_file, flush=True) cmd += " | " + xz_command + " -z -" # Feed the sorted output to the named file. Otherwise, the sorted # output goes to standard output without passing through Python. cmd += " > " + repr(str(output_path)) if verbose: print("sort command: %s" % cmd, file=error_file, flush=True) global cat_proc cat_proc = None global cmd_proc cmd_proc = None def cat_done(cmd, success, exit_code): # When the cat command finishes, monitor the progress of the sort command. if verbose: print("\nDone reading the input file", file=error_file, flush=True) if cmd_proc is None: return # Locate the sort command using pgrep buf = StringIO() try: sh_pgrep = sh.Command(pgrep_command) sh_pgrep("-g", cmd_proc.pgid, "--newest", sort_command, _out=buf) pgrep_output = buf.getvalue() if len(pgrep_output) == 0: if verbose: print("Unable to locate the sort command.", file=error_file, flush=True) return sort_pid = int(pgrep_output) except Exception as e: if verbose: print("Exception looking for sort command: %s" % str(e), file=error_file, flush=True) return finally: buf.close() if verbose: print("Monitoring the sort command (pid=%d)" % sort_pid, file=error_file, flush=True) progress_startup(pid=sort_pid) if str(input_path) == "-": # Read from standard input. # # Sh version 1.13 or greater is required for _pass_fds. sh_bash = sh.Command(bash_command) cmd_proc = sh_bash("-c", cmd, _in=sys.stdin, _out=sys.stdout, _err=sys.stderr, _bg=True, _bg_exc=False, _internal_bufsize=1, _pass_fds={header_write_fd, sortopt_read_fd}) # It would be nice to monitor the sort command here. Unfortunately, there # is a race condition that makes this difficult. We could loop until the # sort command is created, then monitor it. else: # Feed the named file into the data processing pipeline, input_suffix: str = input_path.suffix.lower() if input_suffix in [".gz", ".z"]: if verbose: print("gunzip input file: %s" % repr(str(input_path)), file=error_file, flush=True) sh_gzip = sh.Command(gzip_command) cat_proc = sh_gzip(input_path, "-dc", _in=sys.stdin, _piped=True, _err=sys.stderr, _bg=True, _bg_exc=False, _internal_bufsize=1, _done=cat_done) if verbose: print("full command: %s -dc %s | %s" % (gzip_command, repr(str(input_path)), cmd), file=error_file, flush=True) elif input_suffix in [".bz2", ".bz"]: if verbose: print("bunzip2 input file: %s" % repr(str(input_path)), file=error_file, flush=True) sh_bzip2 = sh.Command(bzip2_command) cat_proc = sh_bzip2(input_path, "-dc", _in=sys.stdin, _piped=True, _err=sys.stderr, _bg=True, _bg_exc=False, _internal_bufsize=1, _done=cat_done) if verbose: print("full command: %s -dc %s | %s" % (bzip2_command, repr(str(input_path)), cmd), file=error_file, flush=True) elif input_suffix in [".xz", ".lzma"]: if verbose: print("unxz input file: %s" % repr(str(input_path)), file=error_file, flush=True) sh_xz = sh.Command(xz_command) cat_proc = sh_xz(input_path, "-dc", _in=sys.stdin, _piped=True, _err=sys.stderr, _bg=True, _bg_exc=False, _internal_bufsize=1, _done=cat_done) if verbose: print("full command: %s -dc %s | %s" % (xz_command, repr(str(input_path)), cmd), file=error_file, flush=True) else: if verbose: print("input file: %s" % repr(str(input_path)), file=error_file, flush=True) cat_proc = sh.cat(input_path, _in=sys.stdin, _piped=True, _err=sys.stderr, _bg=True, _bg_exc=False, _internal_bufsize=1, _done=cat_done) if verbose: print("full command: cat %s | %s" % (repr(str(input_path)), cmd), file=error_file, flush=True) # If enabled, monitor the progress of reading the input file. # Since we do not have access to the pid of the sort command, # we cannot monitor the progress of the merge phases. if verbose: print("Monitoring the cat command (pid=%d)." % cat_proc.pid, file=error_file, flush=True) progress_startup(pid=cat_proc.pid) # Sh version 1.13 or greater is required for _pass_fds. sh_bash = sh.Command(bash_command) cmd_proc = sh_bash(cat_proc, "-c", cmd, _out=sys.stdout, _err=sys.stderr, _bg=True, _bg_exc=False, _internal_bufsize=1, _pass_fds={header_write_fd, sortopt_read_fd}) # Since we do not have access to the pid of the sort command, # we cannot monitor the progress of the merge phases. if verbose: print("Running the sort script (pid=%d)." % cmd_proc.pid, file=error_file, flush=True) if verbose: print("Reading the KGTK input file header line with KgtkReader", file=error_file, flush=True) kr: KgtkReader = KgtkReader.open( Path("<%d" % header_read_fd), options=reader_options, value_options=value_options, error_file=error_file, verbose=verbose, very_verbose=very_verbose, ) if verbose: print("KGTK header: %s" % " ".join(kr.column_names), file=error_file, flush=True) sort_options: str = "" if reverse_sort: sort_options += " --reverse" if numeric_sort: sort_options += " --numeric" if extra is not None and len(extra) > 0: sort_options += " " + extra # We will consume entries in reverse_columns and numeric_columns, # then complain if any are left over. if reverse_columns is not None: reverse_columns = reverse_columns[:] # Protect against modifying a shared list. if numeric_columns is not None: numeric_columns = numeric_columns[:] # Protect against modifying a shared list. column_name: str sort_idx: int if columns is not None and len(columns) > 0: # Process the list of column names, including splitting # comma-separated lists of column names. for column_name in columns: column_name_2: str for column_name_2 in column_name.split(","): column_name_2 = column_name_2.strip() if len(column_name_2) == 0: continue if column_name_2.isdigit(): sort_idx = int(column_name_2) if sort_idx > len(kr.column_names): kr.close() cleanup() raise KGTKException( "Invalid column number %d (max %d)." % (sort_idx, len(kr.column_names))) else: if column_name_2 not in kr.column_names: kr.close() cleanup() raise KGTKException("Unknown column_name %s" % repr(column_name_2)) sort_idx = kr.column_name_map[column_name_2] + 1 sort_options += " -k %d,%d" % (sort_idx, sort_idx) if reverse_columns is not None and column_name_2 in reverse_columns: sort_options += "r" reverse_columns.remove(column_name_2) if numeric_columns is not None and column_name_2 in numeric_columns: sort_options += "n" numeric_columns.remove(column_name_2) else: # TODO: support the case where the column name in reverse_columns # or numeric_columns is an alias of the name used in the file header. if kr.is_node_file: sort_idx = kr.id_column_idx + 1 sort_options += " -k %d,%d" % (sort_idx, sort_idx) column_name = kr.column_names[kr.id_column_idx] if reverse_columns is not None and column_name in reverse_columns: sort_options += "r" reverse_columns.remove(column_name) if numeric_columns is not None and column_name in numeric_columns: sort_options += "n" numeric_columns.remove(column_name) elif kr.is_edge_file: if kr.id_column_idx >= 0: sort_idx = kr.id_column_idx + 1 sort_options += " -k %d,%d" % (sort_idx, sort_idx) column_name = kr.column_names[kr.id_column_idx] if reverse_columns is not None and column_name in reverse_columns: sort_options += "r" reverse_columns.remove(column_name) if numeric_columns is not None and column_name in numeric_columns: sort_options += "n" numeric_columns.remove(column_name) sort_idx = kr.node1_column_idx + 1 sort_options += " -k %d,%d" % (sort_idx, sort_idx) column_name = kr.column_names[kr.node1_column_idx] if reverse_columns is not None and column_name in reverse_columns: sort_options += "r" reverse_columns.remove(column_name) if numeric_columns is not None and column_name in numeric_columns: sort_options += "n" numeric_columns.remove(column_name) sort_idx = kr.label_column_idx + 1 sort_options += " -k %d,%d" % (sort_idx, sort_idx) column_name = kr.column_names[kr.label_column_idx] if reverse_columns is not None and column_name in reverse_columns: sort_options += "r" reverse_columns.remove(column_name) if numeric_columns is not None and column_name in numeric_columns: sort_options += "n" numeric_columns.remove(column_name) sort_idx = kr.node2_column_idx + 1 sort_options += " -k %d,%d" % (sort_idx, sort_idx) column_name = kr.column_names[kr.node2_column_idx] if reverse_columns is not None and column_name in reverse_columns: sort_options += "r" reverse_columns.remove(column_name) if numeric_columns is not None and column_name in numeric_columns: numeric_columns.remove(column_name) sort_options += "n" else: cleanup() raise KGTKException( "Unknown KGTK file mode, please specify the sorting columns." ) # Check for unconsumed entries in reverse_columns and numeric_columns: if reverse_columns is not None and len(reverse_columns) > 0: raise KGTKException("Unknown reverse column(s) %s" % " ".join( [repr(column_name) for column_name in reverse_columns])) if numeric_columns is not None and len(numeric_columns) > 0: raise KGTKException("Unknown numeric column(s) %s" % " ".join( [repr(column_name) for column_name in numeric_columns])) if verbose: print("sort options: %s" % sort_options, file=error_file, flush=True) kr.close() # We are done with the KgtkReader now. # Send the sort options back to the data processing pipeline. with open(sortopt_write_fd, "w") as options_file: options_file.write(sort_options + "\n") if verbose: print("\nWaiting for the sort command to complete.\n", file=error_file, flush=True) cmd_proc.wait() if verbose: print("Cleanup.", file=error_file, flush=True) cleanup() return 0 except Exception as e: # import traceback # traceback.print_tb(sys.exc_info()[2], 10) raise KGTKException('INTERNAL ERROR: ' + type(e).__module__ + '.' + str(e) + '\n')
def python_sort(): if verbose: print("Opening the input file: %s" % str(input_path), file=error_file, flush=True) kr: KgtkReader = KgtkReader.open( input_path, options=reader_options, value_options=value_options, error_file=error_file, verbose=verbose, very_verbose=very_verbose, ) sort_idx: int key_idxs: typing.List[int] = [] if columns is not None and len(columns) > 0: # Process the list of column names, including splitting # comma-separated lists of column names. column_name: str for column_name in columns: column_name_2: str for column_name_2 in column_name.split(","): column_name_2 = column_name_2.strip() if len(column_name_2) == 0: continue if column_name_2.isdigit(): sort_idx = int(column_name_2) if sort_idx > len(kr.column_names): kr.close() cleanup() raise KGTKException( "Invalid column number %d (max %d)." % (sort_idx, len(kr.column_names))) key_idxs.append(sort_idx - 1) else: if column_name_2 not in kr.column_names: kr.close() cleanup() raise KGTKException("Unknown column_name %s" % column_name_2) key_idxs.append(kr.column_name_map[column_name_2]) else: if kr.is_node_file: key_idxs.append(kr.id_column_idx) elif kr.is_edge_file: if kr.id_column_idx >= 0: key_idxs.append(kr.id_column_idx) key_idxs.append(kr.node1_column_idx) key_idxs.append(kr.label_column_idx) key_idxs.append(kr.node2_column_idx) else: cleanup() raise KGTKException( "Unknown KGTK file mode, please specify the sorting columns." ) if verbose: print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]), file=error_file, flush=True) lines: typing.MutableMapping[str, typing.List[str]] = dict() progress_startup() key: str row: typing.List[str] for row in kr: key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx] for idx in key_idxs) lines[key] = row if verbose: print("\nRead %d data lines." % len(lines), file=error_file, flush=True) kw = KgtkWriter.open(kr.column_names, output_path, mode=KgtkWriter.Mode[kr.mode.name], verbose=verbose, very_verbose=very_verbose) for key in sorted(lines.keys()): kw.write(lines[key]) kw.close() kr.close()