def process(self):
    """Read the input file, group its rows by key, and hand them to ``process_row``.

    The key for each row is produced by ``self.build_key`` from a list of
    key column indexes.  That list is either the explicitly requested
    ``self.key_column_names`` or, when none were given, the required columns
    of the edge/node file.  When ``self.deduplicate`` is set, every column
    that is neither a key column nor a keep-first column is added to the key.

    With ``self.sorted_input`` the rows are streamed in input order
    (optionally verifying that the keys are monotonic); otherwise all rows
    are buffered in memory and processed in sorted key order.

    The reader, the output writer, and the optional list-output writer are
    closed when the method exits, whether normally or through an exception.

    Raises:
        ValueError: ID building was requested without ID builder options;
            no key columns were supplied for a file that is neither an edge
            nor a node file; a key or keep-first column is unknown; a
            keep-first column is also a key column; or ``verify_sort``
            detected a change in sort direction.
    """
    import contextlib  # Local import: only this method needs ExitStack.

    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin", file=self.error_file, flush=True)

    # Every file opened below registers its close() here, so nothing is
    # leaked when an error is raised part-way through.
    with contextlib.ExitStack() as cleanup:
        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        cleanup.callback(kr.close)
        self.id_column_idx = kr.id_column_idx

        # If requested, create the ID column builder.
        # Assemble the list of output column names.
        output_column_names: typing.List[str]
        idb: typing.Optional[KgtkIdBuilder] = None
        if self.build_id:
            if self.idbuilder_options is None:
                raise ValueError("ID build requested but ID builder options are missing")
            idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
            output_column_names = idb.column_names
        else:
            output_column_names = kr.column_names

        # Build the list of key column indexes.
        key_idx_list: typing.List[int] = []
        if not self.key_column_names:
            if kr.is_edge_file:
                # Add the KGTK edge file required columns.
                key_idx_list.append(kr.node1_column_idx)
                key_idx_list.append(kr.label_column_idx)
                key_idx_list.append(kr.node2_column_idx)
                if not self.compact_id and kr.id_column_idx >= 0:
                    key_idx_list.append(kr.id_column_idx)
            elif kr.is_node_file:
                # Add the KGTK node file required column.
                key_idx_list.append(kr.id_column_idx)
            else:
                raise ValueError("The input file is neither an edge nor a node file. Key columns must be supplied.")
        else:
            # Append columns to the list of key column indexes,
            # silently removing duplicates, but complaining about unknown names.
            #
            # TODO: warn about duplicates?
            for column_name in self.key_column_names:
                if column_name not in kr.column_name_map:
                    raise ValueError("Column %s is not in the input file" % (repr(column_name)))
                key_idx: int = kr.column_name_map[column_name]
                if key_idx not in key_idx_list:
                    key_idx_list.append(key_idx)

        if self.verbose:
            print("key indexes: %s" % " ".join(map(str, key_idx_list)), file=self.error_file, flush=True)

        # Resolve the keep-first column names to indexes.
        self.keep_first_idx_list.clear()
        if self.keep_first_names:
            for keep_first_name in self.keep_first_names:
                if keep_first_name not in kr.column_name_map:
                    raise ValueError("Keep first column %s is not in the input file" % (repr(keep_first_name)))
                keep_first_idx: int = kr.column_name_map[keep_first_name]
                if keep_first_idx in key_idx_list:
                    raise ValueError("Keep first column %s may not be a key column" % (repr(keep_first_name)))
                self.keep_first_idx_list.append(keep_first_idx)
            if self.verbose:
                print("keep first indexes: %s" % " ".join(map(str, self.keep_first_idx_list)),
                      file=self.error_file, flush=True)

        if self.deduplicate:
            if self.compact_id and kr.id_column_idx >= 0 and kr.id_column_idx not in self.keep_first_idx_list:
                self.keep_first_idx_list.append(kr.id_column_idx)

            # Any columns that aren't in the keep_first list and aren't
            # already in key_idx_list will be appended to key_idx_list:
            for idx in range(kr.column_count):
                if idx not in self.keep_first_idx_list and idx not in key_idx_list:
                    key_idx_list.append(idx)

            if self.verbose:
                print("revised key indexes: %s" % " ".join(map(str, key_idx_list)),
                      file=self.error_file, flush=True)

        if self.verbose:
            print("key indexes: %s" % " ".join(map(str, key_idx_list)), file=self.error_file, flush=True)

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(
            output_column_names,
            self.output_file_path,
            mode=kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            use_mgzip=self.reader_options.use_mgzip,  # Hack! Borrowed from the reader options.
            mgzip_threads=self.reader_options.mgzip_threads,  # Hack! Borrowed from the reader options.
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        cleanup.callback(ew.close)

        # Open the optional list output file.
        lew: typing.Optional[KgtkWriter] = None
        if self.list_output_file_path is not None:
            lew = KgtkWriter.open(
                output_column_names,
                self.list_output_file_path,
                mode=kr.mode,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                use_mgzip=self.reader_options.use_mgzip,  # Hack! Borrowed from the reader options.
                mgzip_threads=self.reader_options.mgzip_threads,  # Hack! Borrowed from the reader options.
                gzip_in_parallel=False,
                verbose=self.verbose,
                very_verbose=self.very_verbose,
            )
            cleanup.callback(lew.close)

        input_line_count: int = 0
        row: typing.List[str] = []
        input_key: str
        prev_input_key: typing.Optional[str] = None
        going_up: typing.Optional[bool] = None  # Unknown until two distinct keys are seen.

        if self.sorted_input:
            if self.verbose:
                print("Reading the input data from %s" % self.input_file_path, file=self.error_file, flush=True)
            for row in kr:
                input_line_count += 1
                input_key = self.build_key(row, key_idx_list)
                if self.verify_sort:
                    # Equal consecutive keys are always acceptable; only a
                    # change of key can establish or violate the direction.
                    if prev_input_key is not None and prev_input_key != input_key:
                        stepped_up: bool = prev_input_key < input_key
                        if going_up is None:
                            going_up = stepped_up
                        elif going_up != stepped_up:
                            shown_prev: str = prev_input_key.replace(self.field_separator,
                                                                     KgtkFormat.LIST_SEPARATOR)
                            shown_curr: str = input_key.replace(self.field_separator,
                                                                KgtkFormat.LIST_SEPARATOR)
                            if going_up:
                                raise ValueError("Line %d sort violation going up: prev='%s' curr='%s'" %
                                                 (input_line_count, shown_prev, shown_curr))
                            raise ValueError("Line %d sort violation going down: prev='%s' curr='%s'" %
                                             (input_line_count, shown_prev, shown_curr))
                    prev_input_key = input_key

                self.process_row(input_key, row, input_line_count, idb, ew, lew)

        else:
            if self.verbose:
                print("Sorting the input data from %s" % self.input_file_path, file=self.error_file, flush=True)
            # Map key values to the lists of input rows that share them.
            input_map: typing.MutableMapping[str, typing.List[typing.List[str]]] = {}
            for row in kr:
                input_line_count += 1
                input_key = self.build_key(row, key_idx_list)
                input_map.setdefault(input_key, []).append(row)

            if self.verbose:
                print("Processing the sorted input data", file=self.error_file, flush=True)
            for input_key in sorted(input_map):
                for row in input_map[input_key]:
                    self.process_row(input_key, row, input_line_count, idb, ew, lew)

        # Flush the final row, if any.  We pass the last row read for
        # feedback, such as an ID uniqueness violation.
        self.process_row("", row, input_line_count, idb, ew, lew, flush=True)

        if self.verbose:
            print("Read %d records, excluded %d records, wrote %d records." %
                  (input_line_count, self.excluded_row_count, self.output_line_count),
                  file=self.error_file, flush=True)
            if lew is not None:
                print("Wrote %d list ouput records." % (self.list_output_line_count),
                      file=self.error_file, flush=True)
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file from input to output through a ``KgtkIdBuilder``.

    Args:
        input_file: Input file specification, resolved with
            ``KGTKArgumentParser.get_input_file``.
        output_file: Output file specification, resolved with
            ``KGTKArgumentParser.get_output_file``.
        errors_to_stdout: When true, diagnostics go to stdout; otherwise
            they go to stderr.
        errors_to_stderr: Accepted for interface compatibility; not read
            by this function.
        show_options: When true, print the resolved options before running.
        verbose: Passed through to the reader and writer.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Used to build the ID builder, reader, and value options.

    Returns:
        0 on success.

    Raises:
        KGTKException: Wraps any ``Exception`` or ``SystemExit`` raised while
            opening or processing the files; the original exception is kept
            as ``__cause__``.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        # First create the KgtkReader.  It provides parameters used by the ID
        # column builder. Next, create the ID column builder, which provides a
        # possibly revised list of column names for the KgtkWriter.  Create
        # the KgtkWriter.  Last, process the data stream.

        # Open the input file.
        kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                         error_file=error_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         verbose=verbose,
                                         very_verbose=very_verbose,
        )
        try:
            # Create the ID builder.
            idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

            # Open the output file.
            ew: KgtkWriter = KgtkWriter.open(idb.column_names,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode[kr.mode.name],
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             gzip_in_parallel=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            try:
                # Process the input file, building IDs.
                idb.process(kr, ew)
            finally:
                # Close the writer even when processing fails.
                ew.close()
        finally:
            # Close the reader even when a later step fails.
            kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise KGTKException(str(e)) from e
def process(self):
    """Read the input file, group its rows by key, and hand them to ``process_row``.

    The key for each row is produced by ``self.build_key`` from a list of
    key column indexes: the required columns of the edge/node file followed
    by any additional ``self.key_column_names``.

    With ``self.sorted_input`` the rows are streamed in input order
    (optionally verifying that the keys are monotonic); otherwise all rows
    are buffered in memory and processed in sorted key order.

    The reader and the writer are closed when the method exits, whether
    normally or through an exception.

    NOTE(review): when the input is neither an edge nor a node file and no
    key columns are named, the key index list is empty and no error is
    raised here.

    Raises:
        ValueError: ID building was requested without ID builder options;
            a key column name is not in the input file; or ``verify_sort``
            detected a change in sort direction.
    """
    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin", file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                     error_file=self.error_file,
                                     options=self.reader_options,
                                     value_options=self.value_options,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose,
    )
    try:
        # If requested, create the ID column builder.
        # Assemble the list of output column names.
        output_column_names: typing.List[str]
        idb: typing.Optional[KgtkIdBuilder] = None
        if self.build_id:
            if self.idbuilder_options is None:
                raise ValueError("ID build requested but ID builder options are missing")
            idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
            output_column_names = idb.column_names
        else:
            output_column_names = kr.column_names

        # Build the list of key column indexes.
        key_idx_list: typing.List[int] = []
        if kr.is_edge_file:
            # Add the KGTK edge file required columns.
            key_idx_list.append(kr.node1_column_idx)
            key_idx_list.append(kr.label_column_idx)
            key_idx_list.append(kr.node2_column_idx)
            if not self.compact_id and kr.id_column_idx >= 0:
                key_idx_list.append(kr.id_column_idx)
        elif kr.is_node_file:
            # Add the KGTK node file required column.
            key_idx_list.append(kr.id_column_idx)

        # Append additional columns to the list of key column indexes,
        # silently removing duplicates, but complaining about unknown names.
        #
        # TODO: warn about duplicates?
        for column_name in self.key_column_names:
            if column_name not in kr.column_name_map:
                raise ValueError("Column %s is not in the input file" % (column_name))
            key_idx: int = kr.column_name_map[column_name]
            if key_idx not in key_idx_list:
                key_idx_list.append(key_idx)

        if self.verbose:
            # Diagnostics belong on the error stream, not stdout, which may
            # be carrying the output data.
            print("key indexes: %s" % " ".join(map(str, key_idx_list)), file=self.error_file, flush=True)

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                         self.output_file_path,
                                         mode=kr.mode,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)
        try:
            input_line_count: int = 0
            row: typing.List[str] = []
            input_key: str
            prev_input_key: typing.Optional[str] = None
            going_up: typing.Optional[bool] = None  # Unknown until two distinct keys are seen.

            if self.sorted_input:
                if self.verbose:
                    print("Reading the input data from %s" % self.input_file_path,
                          file=self.error_file, flush=True)
                for row in kr:
                    input_line_count += 1
                    input_key = self.build_key(row, key_idx_list)
                    if self.verify_sort:
                        # Equal consecutive keys are always acceptable; only a
                        # change of key can establish or violate the direction.
                        if prev_input_key is not None and prev_input_key != input_key:
                            stepped_up: bool = prev_input_key < input_key
                            if going_up is None:
                                going_up = stepped_up
                            elif going_up != stepped_up:
                                if going_up:
                                    raise ValueError("Line %d sort violation going up: prev='%s' curr='%s'" %
                                                     (input_line_count, prev_input_key, input_key))
                                raise ValueError("Line %d sort violation going down: prev='%s' curr='%s'" %
                                                 (input_line_count, prev_input_key, input_key))
                        prev_input_key = input_key

                    self.process_row(input_key, row, input_line_count, idb, ew)

            else:
                if self.verbose:
                    print("Sorting the input data from %s" % self.input_file_path,
                          file=self.error_file, flush=True)
                # Map key values to the lists of input rows that share them.
                input_map: typing.MutableMapping[str, typing.List[typing.List[str]]] = {}
                for row in kr:
                    input_line_count += 1
                    input_key = self.build_key(row, key_idx_list)
                    input_map.setdefault(input_key, []).append(row)

                if self.verbose:
                    print("Processing the sorted input data", file=self.error_file, flush=True)
                for input_key in sorted(input_map):
                    for row in input_map[input_key]:
                        self.process_row(input_key, row, input_line_count, idb, ew)

            # Flush the final row, if any.  We pass the last row read for
            # feedback, such as an ID uniqueness violation.
            self.process_row("", row, input_line_count, idb, ew, flush=True)

            if self.verbose:
                print("Read %d records, wrote %d records." % (input_line_count, self.output_line_count),
                      file=self.error_file, flush=True)
        finally:
            # Close the writer even when processing fails.
            ew.close()
    finally:
        # Close the reader even when a later step fails.
        kr.close()