def process(self):
    """Concatenate the input files into a single output file.

    Every path in ``self.input_file_paths`` is opened with
    ``KgtkReader.open``, the column names of all inputs are merged with a
    ``KgtkMergeColumns`` instance, and the rows of each input are then
    copied, in input order, to a ``KgtkWriter`` opened on
    ``self.output_path``.

    All readers and the writer are registered with an ``ExitStack`` so
    they are closed on every exit path, including when an exception is
    raised part-way through.

    Raises:
        ValueError: if standard input ("-") is named more than once, if
            an edge file and a node file are mixed, if the number of
            ``self.output_column_names`` differs from the number of
            merged columns, or if a reader reports no file path.
    """
    # Local import: only the cleanup logic in this method needs it.
    import contextlib

    kmc: KgtkMergeColumns = KgtkMergeColumns()

    # Is the output file an edge file, a node file, or unknown?
    is_edge_file: bool = False
    is_node_file: bool = False

    krs: typing.List[KgtkReader] = []
    kr: KgtkReader
    idx: int

    if self.verbose:
        print("Starting kgtkcat pid=%d" % (os.getpid()), file=self.error_file, flush=True)

    if self.verbose:
        print("Opening the %d input files." % len(self.input_file_paths), file=self.error_file, flush=True)

    # Callbacks run in reverse registration order when the block exits, so
    # the writer (registered last) is closed before the readers.
    with contextlib.ExitStack() as cleanup:
        saw_stdin: bool = False
        input_file_path: Path
        for idx, input_file_path in enumerate(self.input_file_paths):
            if str(input_file_path) == "-":
                if saw_stdin:
                    raise ValueError("Duplicate standard input file %d" % (idx + 1))
                # Remember that stdin has been consumed so that a second
                # "-" is rejected by the check above.
                saw_stdin = True
                if self.verbose:
                    print("Opening file %d: standard input" % (idx + 1), file=self.error_file, flush=True)
            else:
                if self.verbose:
                    print("Opening file %d: %s" % (idx + 1, str(input_file_path)), file=self.error_file, flush=True)

            kr = KgtkReader.open(
                input_file_path,
                who="input " + str(idx + 1),
                options=self.reader_options,
                value_options=self.value_options,
                error_file=self.error_file,
                verbose=self.verbose,
                very_verbose=self.very_verbose,
            )
            cleanup.callback(kr.close)
            krs.append(kr)

            # Unless directed otherwise, do not merge edge files with node
            # files.  If options.mode == KgtkReaderMode.NONE, then neither
            # kr.is_edge_file nor kr.is_node_file will be set and the
            # consistency check will be skipped.
            if kr.is_edge_file:
                if is_node_file:
                    raise ValueError(
                        "Cannot merge an edge file to a node file: %s" % input_file_path)
                if not is_edge_file and self.verbose:
                    print("The output file will be an edge file.", file=self.error_file, flush=True)
                is_edge_file = True
            elif kr.is_node_file:
                if is_edge_file:
                    raise ValueError(
                        "Cannot merge a node file to an edge file: %s" % input_file_path)
                if not is_node_file and self.verbose:
                    print("The output file will be an node file.", file=self.error_file, flush=True)
                is_node_file = True

            if self.verbose or self.very_verbose:
                print("Mapping the %d column names in %s." % (len(kr.column_names), input_file_path),
                      file=self.error_file, flush=True)
            if self.very_verbose:
                print(" ".join(kr.column_names), file=self.error_file, flush=True)
            new_column_names: typing.List[str] = kmc.merge(kr.column_names)
            if self.very_verbose:
                print(" ".join(new_column_names), file=self.error_file, flush=True)

        if self.verbose or self.very_verbose:
            print("There are %d merged columns." % len(kmc.column_names), file=self.error_file, flush=True)
        if self.very_verbose:
            print(" ".join(kmc.column_names), file=self.error_file, flush=True)

        if self.output_column_names is not None:
            if self.verbose:
                print("There are %d new output column names." % len(self.output_column_names),
                      file=self.error_file, flush=True)
            if len(self.output_column_names) != len(kmc.column_names):
                raise ValueError(
                    "There are %d merged columns, but %d output column names."
                    % (len(kmc.column_names), len(self.output_column_names)))

        output_mode: KgtkWriter.Mode = KgtkWriter.Mode.NONE
        if is_edge_file:
            output_mode = KgtkWriter.Mode.EDGE
            if self.verbose:
                print("Opening the output edge file: %s" % str(self.output_path), file=self.error_file, flush=True)
        elif is_node_file:
            output_mode = KgtkWriter.Mode.NODE
            if self.verbose:
                print("Opening the output node file: %s" % str(self.output_path), file=self.error_file, flush=True)
        else:
            if self.verbose:
                print("Opening the output file: %s" % str(self.output_path), file=self.error_file, flush=True)

        ew: KgtkWriter = KgtkWriter.open(
            kmc.column_names,
            self.output_path,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            use_mgzip=self.reader_options.use_mgzip,  # Hack!
            mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
            gzip_in_parallel=False,
            mode=output_mode,
            output_format=self.output_format,
            output_column_names=self.output_column_names,
            old_column_names=self.old_column_names,
            new_column_names=self.new_column_names,
            verbose=self.verbose,
            very_verbose=self.very_verbose)
        cleanup.callback(ew.close)

        output_data_lines: int = 0
        for idx, kr in enumerate(krs):
            if kr.file_path is None:
                # This shouldn't happen because we constrained all
                # input_file_path elements to be not None.  However,
                # checking here keeps mypy happy.
                #
                # TODO: throw a better exception.
                raise ValueError("Missing file path.")
            input_file_path = kr.file_path
            if self.verbose:
                print("Copying data from file %d: %s" % (idx + 1, input_file_path),
                      file=self.error_file, flush=True)

            # Maps this input's (merged) column names onto the writer's
            # column order.
            shuffle_list: typing.List[int] = ew.build_shuffle_list(
                kmc.new_column_name_lists[idx])

            input_data_lines: int = 0
            row: typing.List[str]
            for row in kr:
                input_data_lines += 1
                output_data_lines += 1
                ew.write(row, shuffle_list=shuffle_list)

            # Flush the output file so far:
            ew.flush()

            if self.verbose:
                print("Read %d data lines from file %d: %s" % (input_data_lines, idx + 1, input_file_path),
                      file=self.error_file, flush=True)

        if self.verbose:
            print("Wrote %d lines total from %d files" % (output_data_lines, len(krs)),
                  file=self.error_file, flush=True)

    # Leaving the ExitStack block has closed the writer and all readers.
def process(self):
    """Join the left and right input files into a single output file.

    Both inputs are opened with ``KgtkReader.open``.  After
    ``self.ok_to_join`` accepts the pair and the join-column index lists
    have been built, ``self.join_key_sets`` supplies an optional set of
    join keys.  Rows from the left file and then the right file are
    written to the output; when the key set is not ``None`` only rows
    whose join key is in the set are kept.  Right-hand rows are written
    through a shuffle list so that they land in the merged column order.

    Both readers and the writer are registered with an ``ExitStack`` so
    they are closed on every exit path, including exceptions.

    Returns:
        ``1`` if ``self.ok_to_join`` rejects the inputs or the left and
        right join keys have different lengths; otherwise ``None``.
    """
    # Local import: only the cleanup logic in this method needs it.
    import contextlib

    output_data_lines: int = 0
    left_data_lines_read: int = 0
    left_data_lines_kept: int = 0
    right_data_lines_read: int = 0
    right_data_lines_kept: int = 0

    # Callbacks run in reverse registration order when the block exits, so
    # the writer (registered last) is closed before the readers.
    with contextlib.ExitStack() as cleanup:
        if self.verbose:
            print("Opening the left edge file: %s" % str(self.left_file_path), file=self.error_file, flush=True)
        left_kr: KgtkReader = KgtkReader.open(self.left_file_path,
                                              who="left input",
                                              options=self.left_reader_options,
                                              value_options=self.value_options,
                                              error_file=self.error_file,
                                              verbose=self.verbose,
                                              very_verbose=self.very_verbose
        )
        cleanup.callback(left_kr.close)

        if self.verbose:
            print("Opening the right edge file: %s" % str(self.right_file_path), file=self.error_file, flush=True)
        right_kr: KgtkReader = KgtkReader.open(self.right_file_path,
                                               who="right input",
                                               options=self.right_reader_options,
                                               value_options=self.value_options,
                                               error_file=self.error_file,
                                               verbose=self.verbose,
                                               very_verbose=self.very_verbose
        )
        cleanup.callback(right_kr.close)

        if not self.ok_to_join(left_kr, right_kr):
            return 1

        left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, self.LEFT, self.left_join_columns)
        right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, self.RIGHT, self.right_join_columns)
        if len(left_join_idx_list) != len(right_join_idx_list):
            print("the left join key has %d components, the right join key has %d columns. Exiting." % (len(left_join_idx_list), len(right_join_idx_list)), file=self.error_file, flush=True)
            return 1

        # This might open the input files for a second time. This won't work with stdin.
        joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets(left_join_idx_list, right_join_idx_list)

        if self.verbose:
            print("Mapping the column names for the join.", file=self.error_file, flush=True)
        kmc: KgtkMergeColumns = KgtkMergeColumns()
        kmc.merge(left_kr.column_names, prefix=self.left_prefix)
        right_column_names: typing.List[str] = kmc.merge(right_kr.column_names, prefix=self.right_prefix)
        joined_column_names: typing.List[str] = kmc.column_names

        if self.verbose:
            print("       left   columns: %s" % " ".join(left_kr.column_names), file=self.error_file, flush=True)
            print("       right  columns: %s" % " ".join(right_kr.column_names), file=self.error_file, flush=True)
            print("mapped right  columns: %s" % " ".join(right_column_names), file=self.error_file, flush=True)
            print("       joined columns: %s" % " ".join(joined_column_names), file=self.error_file, flush=True)

        if self.verbose:
            print("Opening the output edge file: %s" % str(self.output_path), file=self.error_file, flush=True)
        ew: KgtkWriter = KgtkWriter.open(joined_column_names,
                                         self.output_path,
                                         mode=left_kr.mode,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)
        cleanup.callback(ew.close)

        if self.verbose:
            print("Processing the left input file: %s" % str(self.left_file_path), file=self.error_file, flush=True)
        row: typing.List[str]
        for row in left_kr:
            left_data_lines_read += 1
            if joined_key_set is None:
                # No key filter: every left row is written.
                ew.write(row)
                output_data_lines += 1
                left_data_lines_kept += 1
            else:
                left_key: str = self.build_join_key(left_kr, left_join_idx_list, row)
                if left_key in joined_key_set:
                    ew.write(row)
                    output_data_lines += 1
                    left_data_lines_kept += 1
        # Flush the output file so far:
        ew.flush()

        if self.verbose:
            print("Processing the right input file: %s" % str(self.right_file_path), file=self.error_file, flush=True)
        # Right rows are written via the mapped right-hand column names so
        # they line up with the joined column order.
        right_shuffle_list: typing.List[int] = ew.build_shuffle_list(right_column_names)
        for row in right_kr:
            right_data_lines_read += 1
            if joined_key_set is None:
                # No key filter: every right row is written.
                ew.write(row, shuffle_list=right_shuffle_list)
                output_data_lines += 1
                right_data_lines_kept += 1
            else:
                right_key: str = self.build_join_key(right_kr, right_join_idx_list, row)
                if right_key in joined_key_set:
                    ew.write(row, shuffle_list=right_shuffle_list)
                    output_data_lines += 1
                    right_data_lines_kept += 1

    # The writer and both readers are closed at this point; report the
    # totals only after the output has been finalized.
    if self.verbose:
        print("The join is complete", file=self.error_file, flush=True)
        print("%d left input data lines read, %d kept" % (left_data_lines_read, left_data_lines_kept), file=self.error_file, flush=True)
        print("%d right input data lines read, %d kept" % (right_data_lines_read, right_data_lines_kept), file=self.error_file, flush=True)
        print("%d data lines written." % output_data_lines, file=self.error_file, flush=True)