def process(self):
    """Open the input file (and the optional label file) and lift labels.

    The streaming merge strategy (``process_as_merge``) is chosen only when
    exactly one lift column was named, empty columns are not being
    suppressed, both the input and the labels are declared presorted, and a
    label file was supplied.  In every other case the work is delegated to
    ``process_in_memory``.
    """
    # Open the input file.
    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin", file=self.error_file, flush=True)

    ikr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        options=self.reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    # If supplied, open the label file.
    lkr: typing.Optional[KgtkReader] = None
    if self.label_file_path is not None:
        if self.verbose:
            # The label path is known to be set on this branch.  The previous
            # code tested self.input_file_path here (a copy/paste slip), so it
            # reported "Reading the label data from stdin" whenever the
            # *input* came from stdin, even though a label file was named.
            print("Opening the label file: %s" % self.label_file_path, file=self.error_file, flush=True)

        lkr = KgtkReader.open(
            self.label_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

    if self.lift_column_names is not None and len(self.lift_column_names) == 1 and \
       not self.suppress_empty_columns and \
       self.input_is_presorted and \
       self.labels_are_presorted and \
       lkr is not None:
        self.process_as_merge(ikr, lkr)
    else:
        self.process_in_memory(ikr, lkr)
def process(self):
    """Validate the options, open the input file and dispatch the counting.

    Steps, in order:
      1. choose the output column names from ``self.output_format``;
      2. open the input file;
      3. resolve the columns to examine (the reader's node2 column when
         ``self.column_names`` is None);
      4. resolve the optional "where" column and its set of accepted values;
      5. call ``process_presorted`` when the input is presorted, the format
         is not NODE_FORMAT and exactly one column is examined; otherwise
         call ``process_unsorted``.

    Raises:
        ValueError: for an unknown output format, a missing column, or a
            where column given without any values to test.
    """
    # Open the input file.
    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin", file=self.error_file, flush=True)

    output_columns: typing.List[str]
    if self.output_format == self.EDGE_FORMAT:
        output_columns = ["node1", "label", "node2"]
    elif self.output_format == self.NODE_ONLY_FORMAT:
        output_columns = ["id"]
    elif self.output_format == self.NODE_COUNTS_FORMAT:
        output_columns = ["id", self.label_value]
    elif self.output_format == self.NODE_FORMAT:
        output_columns = ["id"]  # Add more later
    else:
        raise ValueError("Unknown output format %s" % str(self.output_format))

    kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                     error_file=self.error_file,
                                     options=self.reader_options,
                                     value_options=self.value_options,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose,
                                     )

    column_idxs: typing.List[int]
    if self.column_names is None:
        if kr.node2_column_idx < 0:
            raise ValueError("No node2 default column name in the input file.")
        column_idxs = [kr.node2_column_idx]
    else:
        column_idxs = []
        column_name: str
        for column_name in self.column_names:
            if column_name not in kr.column_name_map:
                raise ValueError("Column %s is not in the input file" % (column_name))
            column_idxs.append(kr.column_name_map[column_name])

    where_column_idx: int = -1
    # `{ }` is an empty *dict*; the annotation (and the callees that receive
    # this value) call for a set, so start from a genuine empty set.
    where_value_set: typing.Set[str] = set()
    if self.where_column_name is not None:
        if self.where_column_name not in kr.column_name_map:
            raise ValueError("Where column '%s' is not in the input file." % (self.where_column_name))
        where_column_idx = kr.column_name_map[self.where_column_name]
        if self.where_values is None or len(self.where_values) == 0:
            raise ValueError("Where column '%s' but no values to test." % (self.where_column_name))
        else:
            where_value_set = set(self.where_values)

    if self.presorted and self.output_format != self.NODE_FORMAT and len(column_idxs) == 1:
        self.process_presorted(output_columns, kr, column_idxs[0], where_column_idx, where_value_set)
    else:
        self.process_unsorted(output_columns, kr, column_idxs, where_column_idx, where_value_set)
def extract_join_key_set(self, file_path: Path, who: str, join_idx_list: typing.List[int]) -> typing.Set[str]:
    """Return the set of join keys found in one side of a join.

    Args:
        file_path: the KGTK file to scan.
        who: which side is being read; compared against ``self.LEFT`` to
            pick the left or right reader options, and used in messages.
        join_idx_list: indexes of the columns that make up the join key.

    Returns:
        Whatever ``single_column_key_set`` (one key column) or
        ``multi_column_key_set`` (any other count) returns.
    """
    if self.verbose:
        print("Extracting the join key set from the %s input file: %s" % (who, str(file_path)),
              file=self.error_file, flush=True)

    side_options: typing.Optional[KgtkReaderOptions] = (
        self.left_reader_options if who == self.LEFT else self.right_reader_options
    )

    reader: KgtkReader = KgtkReader.open(file_path,
                                         who=who + " input",
                                         options=side_options,
                                         value_options=self.value_options,
                                         error_file=self.error_file,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

    # Per the original inline notes, both helpers close the reader when done.
    if len(join_idx_list) != 1:
        return self.multi_column_key_set(reader, join_idx_list)

    # Single key column: this path uses optimized code.
    return self.single_column_key_set(reader, join_idx_list[0])
# read_metadata_file(metadata_file) -> dict
#
# Reads a KGTK edge file of property metadata and returns a dict that maps a
# node1 value to a dict of {label: node2} pairs.
#
# What the code does, in order:
#   * opens `metadata_file` with KgtkReader in EDGE mode, sending errors to
#     sys.stderr, and looks up the node1 / node2 / label column indexes;
#   * for each row whose label is 'P7482' and whose node2 is 'Q108739856',
#     sets PROFILED_PROPERTY_METADATA[node1] = 1 (a name defined outside this
#     function, mutated here as a side effect);
#   * gives a node1 that contains no '-' and has not been seen yet an empty
#     dict in the result;
#   * reduces a node1 containing '-' to the text before its first '-';
#   * stores the 'datatype' label under the key 'qualifier_datatype';
#   * records label -> node2 only when the (possibly shortened) node1 already
#     has an entry; rows for unknown node1 values are dropped silently.
#
# NOTE(review): the indentation of this function has been lost, so it cannot
# be determined from here whether the 'datatype' -> 'qualifier_datatype'
# rename applies to every row or only to rows whose node1 contained '-'.
# The key name suggests the latter -- confirm before re-indenting.
# NOTE(review): the reader is never closed explicitly in this function.
def read_metadata_file(metadata_file): kr: KgtkReader = KgtkReader.open(Path(metadata_file), error_file=sys.stderr, mode=KgtkReaderMode.EDGE ) node1_idx = kr.column_name_map['node1'] node2_idx = kr.column_name_map['node2'] label_idx = kr.column_name_map['label'] sorting_metadata = {} for row in kr: node1 = row[node1_idx] label = row[label_idx] node2 = row[node2_idx] if label == 'P7482' and node2 == 'Q108739856': PROFILED_PROPERTY_METADATA[node1] = 1 if node1 not in sorting_metadata and '-' not in node1: sorting_metadata[node1] = dict() if '-' in node1: node1 = node1.split('-')[0] if label == 'datatype': label = 'qualifier_datatype' prop_val_dict = sorting_metadata.get(node1, None) if prop_val_dict is not None: prop_val_dict[label] = node2 return sorting_metadata
def process(self):
    """Rewrite the input as header-less (node1, relation, node2) triples.

    The writer is opened with the input's column names, which makes it emit
    a header line; that line is then removed by rewinding and truncating the
    writer's underlying file before any data row is written.
    """
    def note(message: str) -> None:
        # Progress messages are only shown in verbose mode.
        if self.verbose:
            print(message, file=self.error_file, flush=True)

    # Open the input file.
    note("Opening the input file: %s" % str(self.input_file_path))
    reader: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        options=self.reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    # Open the output file.
    note("Opening the output file: %s" % str(self.output_file_path))
    writer: KgtkWriter = KgtkWriter.open(reader.column_names,
                                         self.output_file_path,
                                         mode=KgtkWriter.Mode[reader.mode.name],
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)
    # At this point the writer has already produced one line (the header),
    # which PBG does not need.

    note("Processing the input records.")

    # Output layout: node1 relation node2
    node1_index = reader.get_node1_column_index()
    node2_index = reader.get_node2_column_index()
    relation_index = reader.get_id_column_index('relation')

    # Delete the header: move the cursor to the top of the file, then
    # truncate everything after it.
    out_handle = writer.file_out
    out_handle.seek(0)
    out_handle.truncate()

    input_line_count: int = 0
    row: typing.List[str]
    for input_line_count, row in enumerate(reader, start=1):
        triple = [row[node1_index], row[relation_index], row[node2_index]]
        writer.write(triple)

    note("Processed %d records." % (input_line_count))

    writer.close()
def main():
    """
    Test the KGTK edge file writer.

    Copies every row of the input KGTK file to the output KGTK file,
    applying whatever output mode, format and column renaming options were
    given on the command line.

    TODO: full reader options.

    TODO: --show-options
    """
    parser = ArgumentParser()
    parser.add_argument(dest="input_kgtk_file", help="The KGTK file to read", type=Path, nargs="?")
    parser.add_argument(dest="output_kgtk_file", help="The KGTK file to write", type=Path, nargs="?")

    parser.add_argument("--header-error-action", dest="header_error_action",
                        help="The action to take when a header error is detected  Only ERROR or EXIT are supported.",
                        type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT)

    parser.add_argument("--gzip-in-parallel", dest="gzip_in_parallel",
                        help="Execute gzip in a subthread.", action='store_true')

    parser.add_argument("--input-mode", dest="input_mode",
                        help="Determine the input KGTK file mode.",
                        type=KgtkReader.Mode, action=EnumNameAction, default=KgtkReader.Mode.AUTO)

    parser.add_argument("--output-mode", dest="output_mode",
                        help="Determine the output KGTK file mode.",
                        type=KgtkWriter.Mode, action=EnumNameAction, default=KgtkWriter.Mode.AUTO)

    parser.add_argument("--output-format", dest="output_format",
                        help="The file format (default=kgtk)", type=str)

    parser.add_argument("--output-columns", dest="output_column_names",
                        help="Rename all output columns. (default=%(default)s)", type=str, nargs='+')

    parser.add_argument("--old-columns", dest="old_column_names",
                        help="Rename seleted output columns: old names. (default=%(default)s)", type=str, nargs='+')

    parser.add_argument("--new-columns", dest="new_column_names",
                        help="Rename seleted output columns: new names. (default=%(default)s)", type=str, nargs='+')

    # `args.errors_to_stdout` is read below, but no option used to define it,
    # so every invocation failed with AttributeError.  Define the flag here;
    # it defaults to False, which keeps error output on stderr.
    parser.add_argument("--errors-to-stdout", dest="errors_to_stdout",
                        help="Send errors to stdout instead of stderr.", action='store_true')

    parser.add_argument("-v", "--verbose", dest="verbose",
                        help="Print additional progress messages.", action='store_true')

    parser.add_argument("--very-verbose", dest="very_verbose",
                        help="Print additional progress messages.", action='store_true')

    args = parser.parse_args()

    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    kr: KgtkReader = KgtkReader.open(args.input_kgtk_file,
                                     error_file=error_file,
                                     header_error_action=args.header_error_action,
                                     gzip_in_parallel=args.gzip_in_parallel,
                                     mode=args.input_mode,
                                     verbose=args.verbose, very_verbose=args.very_verbose)

    kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                     args.output_kgtk_file,
                                     error_file=error_file,
                                     gzip_in_parallel=args.gzip_in_parallel,
                                     header_error_action=args.header_error_action,
                                     mode=args.output_mode,
                                     output_format=args.output_format,
                                     output_column_names=args.output_column_names,
                                     old_column_names=args.old_column_names,
                                     new_column_names=args.new_column_names,
                                     verbose=args.verbose, very_verbose=args.very_verbose)

    line_count: int = 0
    row: typing.List[str]
    for row in kr:
        kw.write(row)
        line_count += 1
    kw.close()

    if args.verbose:
        print("Copied %d lines" % line_count, file=error_file, flush=True)
def process(self):
    """Label the connected components of the input graph and write them out.

    The input file is opened with KgtkReader only to resolve the key columns
    (via ``self.get_key_columns``); the graph itself is loaded from the same
    path by ``load_graph_from_csv``.  When ``self.properties`` is set (a
    comma-separated string), the graph's edges are replaced by only those
    edges whose label matches one of the listed properties before the
    components are computed.

    One output edge ``(<vertex name>, 'connected_component', <component>)``
    is written per vertex.
    """
    input_kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                           error_file=self.error_file,
                                           who="input",
                                           options=self.input_reader_options,
                                           value_options=self.value_options,
                                           verbose=self.verbose,
                                           very_verbose=self.very_verbose,
                                           )

    input_key_columns: typing.List[int] = self.get_key_columns(input_kr, "input")
    label_col_idx = input_key_columns[1]
    # Name of the edge property holding the label column -- presumably the
    # loader names non-endpoint columns 'c<index>'; confirm against
    # load_graph_from_csv.
    label = '{}{}'.format('c', label_col_idx)

    g = load_graph_from_csv(str(input_kr.file_path), not (self.undirected),
                            skip_first=not (self.no_header),
                            hashed=True,
                            csv_options={'delimiter': '\t'},
                            ecols=(input_key_columns[0], input_key_columns[2]))

    es = []
    header = ['node1', 'label', 'node2']
    if self.properties:
        properties = self.properties.split(',')
        for e in properties:
            es += (find_edge(g, g.edge_properties[label], e))
        # Keep only the edges that matched one of the requested properties.
        g.clear_edges()
        g.add_edge_list(list(set(es)))

    comp, hist = label_components(g, directed=self.strong)

    ew: KgtkWriter = KgtkWriter.open(header,
                                     self.output_file_path,
                                     mode=input_kr.mode,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)
    try:
        for v, c in enumerate(comp):
            ew.write([g.vertex_properties['name'][v], 'connected_component', str(c)])
    finally:
        # The writer was previously never closed, which can leave buffered
        # output unwritten.  Close it even if a write fails.
        ew.close()
def process(self):
    """Copy every record of the input KGTK file to the output file.

    The output file gets the same column names and the same mode as the
    input; all columns are required and extra columns are prohibited.
    Progress messages go to ``self.error_file`` when ``self.verbose`` is set.
    """
    chatty = self.verbose
    log = self.error_file

    # Open the input file.
    if chatty:
        print("Opening the input file: %s" % str(self.input_file_path), file=log, flush=True)
    source: KgtkReader = KgtkReader.open(self.input_file_path,
                                         error_file=self.error_file,
                                         options=self.reader_options,
                                         value_options=self.value_options,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose,
                                         )

    # Open the output file.
    if chatty:
        print("Opening the output file: %s" % str(self.output_file_path), file=log, flush=True)
    sink: KgtkWriter = KgtkWriter.open(source.column_names,
                                       self.output_file_path,
                                       mode=KgtkWriter.Mode[source.mode.name],
                                       require_all_columns=True,
                                       prohibit_extra_columns=True,
                                       fill_missing_columns=False,
                                       gzip_in_parallel=False,
                                       verbose=self.verbose,
                                       very_verbose=self.very_verbose)

    if chatty:
        print("Processing the input records.", file=log, flush=True)

    # Stays 0 when the input has no data rows.
    input_line_count: int = 0
    record: typing.List[str]
    for input_line_count, record in enumerate(source, start=1):
        sink.write(record)

    if chatty:
        print("Processed %d records." % (input_line_count), file=log, flush=True)

    sink.close()
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        output_format: typing.Optional[str],

        column_names: typing.List[str],
        omit_remaining_columns: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
        ) -> int:
    """Copy a KGTK file with its columns reordered (and optionally trimmed).

    ``column_names`` lists the columns in their new order and may contain
    two operators:

      * ``...`` (ellipses), at most once: stands for every column that is
        not mentioned anywhere else in the list;
      * ``..`` (range), between two column names: stands for every input
        column lying between them, walked in the direction from the first
        name to the second.

    Columns left unmentioned without an ellipses are an error unless
    ``omit_remaining_columns`` is true, in which case they are dropped.

    Returns:
        0 on success.

    Raises:
        KGTKException: for any problem in the column list; every other
            exception (and SystemExit) is converted to KGTKException too.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file, flush=True)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file, flush=True)
        if output_format is not None:
            print("--output-format=%s" % output_format, file=error_file, flush=True)
        print("--columns %s" % " ".join(column_names), file=error_file, flush=True)
        print("--trim=%s" % str(omit_remaining_columns), file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        if verbose:
            print("Opening the input file %s" % str(input_kgtk_file), file=error_file, flush=True)
        kr = KgtkReader.open(input_kgtk_file,
                             options=reader_options,
                             value_options=value_options,
                             error_file=error_file,
                             verbose=verbose,
                             very_verbose=very_verbose,
                             )

        # Columns not yet placed; a name leaves this list when it is placed.
        remaining_names: typing.List[str] = kr.column_names.copy()
        # Columns placed so far (after the ellipses, once one has been seen).
        reordered_names: typing.List[str] = []
        # Columns placed before the ellipses; None until an ellipses is seen.
        save_reordered_names: typing.Optional[typing.List[str]] = None

        ellipses: str = "..."  # All unmentioned columns
        ranger: str = ".."  # All columns between two columns.

        saw_ranger: bool = False
        column_name: str
        for column_name in column_names:
            if column_name == ellipses:
                if save_reordered_names is not None:
                    raise KGTKException("Elipses may appear only once")

                if saw_ranger:
                    raise KGTKException("ELipses may not appear directly after a range operator ('..').")

                save_reordered_names = reordered_names
                reordered_names = []
                continue

            if column_name == ranger:
                if len(reordered_names) == 0:
                    raise KGTKException("The column range operator ('..') may not appear without a preceeding column name.")
                saw_ranger = True
                continue

            if column_name not in kr.column_names:
                raise KGTKException("Unknown column name '%s'." % column_name)
            if column_name not in remaining_names:
                raise KGTKException("Column name '%s' was duplicated in the list." % column_name)

            if saw_ranger:
                saw_ranger = False
                prior_column_name: str = reordered_names[-1]
                prior_column_idx: int = kr.column_name_map[prior_column_name]
                column_name_idx: int = kr.column_name_map[column_name]

                # The columns strictly between the two endpoints, walked from
                # the prior column towards the current one.  The old loop
                # tested `idx <= end_idx` even when stepping by -1, so a
                # descending range either included nothing or ran past the
                # range and raised a spurious duplicate error.
                between: range
                if column_name_idx > prior_column_idx:
                    between = range(prior_column_idx + 1, column_name_idx)
                else:
                    between = range(prior_column_idx - 1, column_name_idx, -1)

                idx: int
                for idx in between:
                    idx_column_name: str = kr.column_names[idx]
                    if idx_column_name not in remaining_names:
                        # Report the column that was actually duplicated, not
                        # the range's end point.
                        raise KGTKException("Column name '%s' (%s .. %s) was duplicated in the list." % (idx_column_name, prior_column_name, column_name))

                    reordered_names.append(idx_column_name)
                    remaining_names.remove(idx_column_name)

            reordered_names.append(column_name)
            remaining_names.remove(column_name)

        if saw_ranger:
            raise KGTKException("The column ranger operator ('..') may not end the list of column names.")

        if len(remaining_names) > 0 and save_reordered_names is None:
            # There are remaining column names and the ellipses was not seen.
            if not omit_remaining_columns:
                raise KGTKException("No ellipses, and the following columns not accounted for: %s" % " ".join(remaining_names))
            else:
                if verbose:
                    print("Omitting the following columns: %s" % " ".join(remaining_names), file=error_file, flush=True)

        if save_reordered_names is not None:
            # Final order: names before the ellipses, then the unmentioned
            # columns, then the names after the ellipses.
            if len(remaining_names) > 0:
                save_reordered_names.extend(remaining_names)
            if len(reordered_names) > 0:
                save_reordered_names.extend(reordered_names)
            reordered_names = save_reordered_names

        if verbose:
            print("Opening the output file %s" % str(output_kgtk_file), file=error_file, flush=True)
        kw: KgtkWriter = KgtkWriter.open(reordered_names,
                                         output_kgtk_file,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         gzip_in_parallel=False,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         output_format=output_format,
                                         verbose=verbose,
                                         very_verbose=very_verbose,
                                         )

        shuffle_list: typing.List = kw.build_shuffle_list(kr.column_names)

        input_data_lines: int = 0
        row: typing.List[str]
        for row in kr:
            input_data_lines += 1
            kw.write(row, shuffle_list=shuffle_list)

        # Flush the output file so far:
        kw.flush()

        if verbose:
            print("Read %d data lines from file %s" % (input_data_lines, input_kgtk_file), file=error_file, flush=True)

        kw.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        mapping_file: KGTKFiles,
        unmodified_edges_file: KGTKFiles,
        activated_mapping_file: KGTKFiles,
        rejected_mapping_file: KGTKFiles,

        confidence_column_name: str,
        require_confidence: bool,
        default_confidence_str: typing.Optional[str],
        confidence_threshold: float,

        same_as_item_label: str,
        same_as_property_label: str,
        allow_exact_duplicates: bool,
        allow_idempotent_mapping: bool,

        split_output_mode: bool,
        modified_pattern: str,

        node1_column_name: typing.Optional[str],
        label_column_name: typing.Optional[str],
        node2_column_name: typing.Optional[str],

        mapping_rule_mode: str,
        mapping_node1_column_name: typing.Optional[str],
        mapping_label_column_name: typing.Optional[str],
        mapping_node2_column_name: typing.Optional[str],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
        ) -> int:
    """Rewrite node1/label/node2 values of an edge file using a mapping file.

    Phase 1 reads the mapping file into two tables: item rules (applied to
    node1 and node2) and property rules (applied to label).  A mapping row
    is skipped when its confidence is below ``confidence_threshold`` (and
    copied to the rejected-mapping file when one was requested) or when it
    maps a value to itself and ``allow_idempotent_mapping`` is false.
    Conflicting duplicate rules, unparsable or missing-but-required
    confidence values and unknown mapping labels are counted as errors and
    abort the run after the whole mapping file has been read.

    Phase 2 streams the input file.  Each row is rewritten through the
    tables selected by ``mapping_rule_mode`` and classified as modified or
    unmodified according to ``modified_pattern``.  Modified rows go to the
    output file.  Unmodified rows go to the unmodified-edges file when one
    was requested, and also to the output file unless ``split_output_mode``
    is set.  Mapping rows that were actually used are written, in mapping
    file order, to the activated-mapping file when one was requested.

    Returns:
        0 on success.

    Raises:
        KGTKException: for invalid options, missing columns, mapping file
            errors, or an unrecognized ``modified_pattern``; every other
            exception (and SystemExit) raised during processing is converted
            to KGTKException too.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    mapping_kgtk_file: Path = KGTKArgumentParser.get_input_file(mapping_file, who="KGTK mappping file")
    unmodified_edges_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmodified_edges_file, who="KGTK unmodified edges output file")
    activated_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(activated_mapping_file, who="KGTK activated mapping output file")
    rejected_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(rejected_mapping_file, who="KGTK rejected mapping output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    mapping_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="mapping", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % repr(str(input_kgtk_file)), file=error_file, flush=True)
        print("--output-file=%s" % repr(str(output_kgtk_file)), file=error_file, flush=True)
        print("--mapping-file=%s" % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        if unmodified_edges_kgtk_file is not None:
            print("--unmodified-edges-file=%s" % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
        if activated_mapping_kgtk_file is not None:
            print("--activated-mapping-edges-file=%s" % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
        if rejected_mapping_kgtk_file is not None:
            print("--rejected-mapping-edges-file=%s" % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)

        print("--confidence-column=%s" % repr(confidence_column_name), file=error_file, flush=True)
        print("--require-confidence=%s" % repr(require_confidence), file=error_file, flush=True)
        if default_confidence_str is not None:
            print("--default-confidence-value=%s" % default_confidence_str, file=error_file, flush=True)
        print("--threshold=%f" % confidence_threshold, file=error_file, flush=True)

        print("--same-as-item-label=%s" % repr(same_as_item_label), file=error_file, flush=True)
        print("--same-as-property-label=%s" % repr(same_as_property_label), file=error_file, flush=True)
        print("--allow-exact-duplicates=%s" % repr(allow_exact_duplicates), file=error_file, flush=True)
        print("--allow-idempotent-actions=%s" % repr(allow_idempotent_mapping), file=error_file, flush=True)

        print("--split-output-mode=%s" % repr(split_output_mode), file=error_file, flush=True)
        print("--modified-pattern=%s" % repr(modified_pattern), file=error_file, flush=True)

        if node1_column_name is not None:
            print("--node1-column-=%s" % repr(node1_column_name), file=error_file, flush=True)
        if label_column_name is not None:
            print("--label-column-=%s" % repr(label_column_name), file=error_file, flush=True)
        if node2_column_name is not None:
            print("--node2-column-=%s" % repr(node2_column_name), file=error_file, flush=True)

        print("--mapping-rule-mode=%s" % repr(mapping_rule_mode), file=error_file, flush=True)
        if mapping_node1_column_name is not None:
            print("--mapping-node1-column-=%s" % repr(mapping_node1_column_name), file=error_file, flush=True)
        if mapping_label_column_name is not None:
            print("--mapping-label-column-=%s" % repr(mapping_label_column_name), file=error_file, flush=True)
        if mapping_node2_column_name is not None:
            print("--mapping-node2-column-=%s" % repr(mapping_node2_column_name), file=error_file, flush=True)

        input_reader_options.show(out=error_file, who="input")
        mapping_reader_options.show(out=error_file, who="mapping")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    default_confidence_value: typing.Optional[float] = None
    if default_confidence_str is not None:
        try:
            default_confidence_value = float(default_confidence_str)
        except ValueError:
            # Only a parse failure means "invalid".  The former bare
            # `except:` also swallowed KeyboardInterrupt and SystemExit.
            raise KGTKException("--default-confidence-value=%s is invalid" % repr(default_confidence_str))

    try:
        if verbose:
            print("Opening the mapping file %s." % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        mkr: KgtkReader = KgtkReader.open(mapping_kgtk_file,
                                          options=mapping_reader_options,
                                          value_options=value_options,
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose,
                                          )

        trouble = False
        mapping_node1_idx: int = mkr.get_node1_column_index(mapping_node1_column_name)
        mapping_label_idx: int = mkr.get_label_column_index(mapping_label_column_name)
        mapping_node2_idx: int = mkr.get_node2_column_index(mapping_node2_column_name)
        if mapping_node1_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node1 column.", file=error_file, flush=True)
        # The label column only matters when each row chooses its own rule.
        if mapping_label_idx < 0 and mapping_rule_mode == "normal":
            trouble = True
            print("Error: Cannot find the mapping file label column.", file=error_file, flush=True)
        if mapping_node2_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            mkr.close()
            raise KGTKException("Missing columns in the mapping file.")

        confidence_column_idx: int = mkr.column_name_map.get(confidence_column_name, -1)
        if require_confidence and confidence_column_idx < 0:
            mkr.close()
            raise KGTKException("The mapping file does not have a confidence column, and confidence is required.")

        rmkw: typing.Optional[KgtkWriter] = None
        if rejected_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the rejected mapping edges file %s." % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)
            rmkw = KgtkWriter.open(mkr.column_names,
                                   rejected_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # Mapping structures:
        item_map: typing.MutableMapping[str, str] = dict()
        item_line_map: typing.MutableMapping[str, int] = dict()
        property_map: typing.MutableMapping[str, str] = dict()
        property_line_map: typing.MutableMapping[str, int] = dict()

        mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()
        activated_mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()

        # Read the mapping file.
        if verbose:
            print("Processing the mapping file.", file=error_file, flush=True)
        mapping_confidence_exclusions: int = 0
        mapping_idempotent_exclusions: int = 0
        mapping_errors: int = 0
        mapping_line_number: int = 0
        mrow: typing.List[str]
        for mrow in mkr:
            mapping_line_number += 1
            mapping_node1: str = mrow[mapping_node1_idx]
            mapping_label: str = mrow[mapping_label_idx] if mapping_rule_mode == "normal" else ""
            mapping_node2: str = mrow[mapping_node2_idx]
            mapping_confidence: typing.Optional[float] = default_confidence_value

            if confidence_column_idx >= 0:
                confidence_value_str: str = mrow[confidence_column_idx]
                if len(confidence_value_str) == 0:
                    # An empty value falls back to the default unless a
                    # confidence value is mandatory.
                    if require_confidence:
                        print("In line %d of the mapping file: the required confidence value is missing" % (mapping_line_number),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue
                else:
                    try:
                        mapping_confidence = float(confidence_value_str)
                    except ValueError:
                        print("In line %d of the mapping file: cannot parse confidence value %s" % (mapping_line_number,
                                                                                                    repr(mrow[confidence_column_idx])),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue

            if mapping_confidence is not None and mapping_confidence < confidence_threshold:
                mapping_confidence_exclusions += 1
                if rmkw is not None:
                    rmkw.write(mrow)
                continue

            if mapping_node1 == mapping_node2 and not allow_idempotent_mapping:
                mapping_idempotent_exclusions += 1
                continue

            if mapping_rule_mode == "same-as-item" or mapping_label == same_as_item_label:
                if mapping_node1 in item_map:
                    # A repeated rule is tolerated only when it is an exact
                    # duplicate and exact duplicates are allowed.
                    if mapping_node2 != item_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                       repr(mapping_node1),
                                                                                                       mapping_line_number,
                                                                                                       item_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue

                item_map[mapping_node1] = mapping_node2
                item_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            elif mapping_rule_mode == "same-as-property" or mapping_label == same_as_property_label:
                if mapping_node1 in property_map:
                    if mapping_node2 != property_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                       repr(mapping_node1),
                                                                                                       mapping_line_number,
                                                                                                       property_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue

                property_map[mapping_node1] = mapping_node2
                property_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            else:
                print("Unknown mapping action %s at line %d of mapping file %s" % (mapping_label,
                                                                                   mapping_line_number,
                                                                                   repr(str(mapping_kgtk_file))),
                      file=error_file, flush=True)
                mapping_errors += 1
                continue

        # Close the mapping file.
        mkr.close()
        if rmkw is not None:
            rmkw.close()

        if mapping_errors > 0:
            raise KGTKException("%d errors detected in the mapping file %s" % (mapping_errors, repr(str(mapping_kgtk_file))))

        if len(item_map) == 0 and len(property_map) == 0:
            raise KGTKException("Nothing read from the mapping file %s" % repr(str(mapping_kgtk_file)))

        if verbose:
            print("%d mapping lines, %d excluded for confidence, %d excluded for idempotency." % (mapping_line_number,
                                                                                                  mapping_confidence_exclusions,
                                                                                                  mapping_idempotent_exclusions),
                  file=error_file, flush=True)
            print("%d item mapping rules." % len(item_map), file=error_file, flush=True)
            print("%d property mapping rules." % len(property_map), file=error_file, flush=True)

        if verbose:
            print("Opening the input file %s." % repr(str(input_kgtk_file)), file=error_file, flush=True)
        ikr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                          options=input_reader_options,
                                          value_options=value_options,
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose,
                                          )

        trouble = False
        input_node1_idx: int = ikr.get_node1_column_index(node1_column_name)
        input_label_idx: int = ikr.get_label_column_index(label_column_name)
        input_node2_idx: int = ikr.get_node2_column_index(node2_column_name)
        # A column is required only when the rule mode will rewrite it.
        if input_node1_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node1 column.", file=error_file, flush=True)
        if input_label_idx < 0 and mapping_rule_mode in ["normal", "same-as-property"]:
            trouble = True
            print("Error: Cannot find the input file label column.", file=error_file, flush=True)
        if input_node2_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            ikr.close()
            raise KGTKException("Missing columns in the input file.")

        okw: KgtkWriter = KgtkWriter.open(ikr.column_names,
                                          output_kgtk_file,
                                          mode=KgtkWriter.Mode[ikr.mode.name],
                                          use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                          mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose)

        uekw: typing.Optional[KgtkWriter] = None
        if unmodified_edges_kgtk_file is not None:
            if verbose:
                print("Opening the unmodified edges file %s." % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
            uekw = KgtkWriter.open(ikr.column_names,
                                   unmodified_edges_kgtk_file,
                                   mode=KgtkWriter.Mode[ikr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        amkw: typing.Optional[KgtkWriter] = None
        if activated_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the activated mapping edges file %s." % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
            amkw = KgtkWriter.open(mkr.column_names,
                                   activated_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip,  # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads,  # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # Process each row of the input file.
        if verbose:
            print("Processing the input file.", file=error_file, flush=True)
        input_count: int = 0
        modified_edge_count: int = 0
        unmodified_edge_count: int = 0
        row: typing.List[str]
        for row in ikr:
            input_count += 1
            newrow: typing.List[str] = row.copy()

            modified_node1: bool = False
            modified_node2: bool = False
            modified_label: bool = False

            if mapping_rule_mode in ["normal", "same-as-item"]:
                input_node1: str = row[input_node1_idx]
                if input_node1 in item_map:
                    newrow[input_node1_idx] = item_map[input_node1]
                    modified_node1 = True
                    if amkw is not None:
                        # Remember which mapping rule fired, once per rule.
                        mapping_line_number = item_line_map[input_node1]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

                input_node2: str = row[input_node2_idx]
                if input_node2 in item_map:
                    newrow[input_node2_idx] = item_map[input_node2]
                    modified_node2 = True
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node2]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            if mapping_rule_mode in ["normal", "same-as-property"]:
                input_label: str = row[input_label_idx]
                if input_label in property_map:
                    newrow[input_label_idx] = property_map[input_label]
                    modified_label = True
                    if amkw is not None:
                        mapping_line_number = property_line_map[input_label]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            # Decide whether this edge counts as "modified".
            modified: bool
            if modified_pattern == "node1|label|node2":
                modified = modified_node1 or modified_label or modified_node2
            elif modified_pattern == "node1|label":
                modified = modified_node1 or modified_label
            elif modified_pattern == "node1|node2":
                modified = modified_node1 or modified_node2
            elif modified_pattern == "label|node2":
                modified = modified_label or modified_node2
            elif modified_pattern == "node1":
                modified = modified_node1
            elif modified_pattern == "label":
                modified = modified_label
            elif modified_pattern == "node2":
                modified = modified_node2
            elif modified_pattern == "node1&label&node2":
                modified = modified_node1 and modified_label and modified_node2
            elif modified_pattern == "node1&label":
                modified = modified_node1 and modified_label
            elif modified_pattern == "node1&node2":
                modified = modified_node1 and modified_node2
            elif modified_pattern == "label&node2":
                modified = modified_label and modified_node2
            else:
                raise KGTKException("Unrecognized modification test pattern %s" % repr(modified_pattern))

            if modified:
                modified_edge_count += 1
                okw.write(newrow)
            else:
                unmodified_edge_count += 1
                if uekw is not None:
                    uekw.write(row)
                # Outside split mode the main output carries every edge.
                if not split_output_mode:
                    okw.write(row)

        # Done!
        ikr.close()
        okw.close()

        if verbose:
            print("%d edges read. %d modified, %d unmodified." % (input_count, modified_edge_count, unmodified_edge_count), file=error_file, flush=True)

        if uekw is not None:
            uekw.close()

        if amkw is not None:
            activated_count: int = 0
            # Emit the activated rules in mapping file order.
            for mapping_line_number in sorted(activated_mapping_rows.keys()):
                amkw.write(activated_mapping_rows[mapping_line_number])
                activated_count += 1
            amkw.close()

            if verbose:
                print("%d activated mapping edges" % activated_count, file=error_file, flush=True)

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def run(
        input_file: KGTKFiles,
        path_file: KGTKFiles,
        output_file: KGTKFiles,
        statistics_only: bool,
        undirected: bool,
        max_hops: int,
        source_column_name: typing.Optional[str],
        target_column_name: typing.Optional[str],
        shortest_path: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Find the paths between pairs of nodes and write them as KGTK edges.

    The (source, target) pairs are read from ``path_file``; the graph is
    loaded from ``input_file``.  Unless ``statistics_only`` is set, every
    edge of the graph is first copied to the output.  Then, for each pair
    whose source and target each match exactly one vertex, one output row is
    written per edge of each path found:

        node1 = "p<path number>", label = <position of the edge in the path>,
        node2 = <id of the graph edge>, id = "<node1>-<position>-<counter>"

    ``shortest_path`` selects all shortest paths; otherwise all paths of at
    most ``max_hops`` edges are enumerated.  ``errors_to_stderr`` and
    ``show_options`` are accepted but not used by this function.

    Raises:
        KGTKException: wraps any exception raised while processing
            (including missing-column and malformed-row errors).
    """
    # import modules locally
    from pathlib import Path
    import sys
    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions
    from kgtk.exceptions import KGTKException

    try:
        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

        # Name of the vertex property that holds the node symbol.
        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file),
                  file=error_file, flush=True)

        pairs = []
        pkr: KgtkReader = KgtkReader.open(
            path_kgtk_file,
            error_file=error_file,
            options=path_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        path_source_idx: int = pkr.get_node1_column_index(source_column_name)
        if path_source_idx < 0:
            print("Missing node1 (source) column name in the path file.",
                  file=error_file, flush=True)

        path_target_idx: int = pkr.get_node2_column_index(target_column_name)
        if path_target_idx < 0:
            # The target is the node2 column (the message used to say node1).
            print("Missing node2 (target) column name in the path file.",
                  file=error_file, flush=True)

        if path_source_idx < 0 or path_target_idx < 0:
            pkr.close()
            raise KGTKException("Exiting due to missing columns.")

        paths_read: int = 0
        path_row: typing.List[str]
        for path_row in pkr:
            paths_read += 1
            if len(path_row) != pkr.column_count:
                raise KGTKException(
                    "Exiting because line %d in the path file (%s) is the wrong length: %d columns expected, %d were read."
                    % (paths_read, str(path_kgtk_file), pkr.column_count, len(path_row)))
            src: str = path_row[path_source_idx]
            tgt: str = path_row[path_target_idx]
            pairs.append((src, tgt))
        pkr.close()

        if verbose:
            print("%d path rows read" % paths_read, file=error_file, flush=True)
        if len(pairs) == 0:
            # Always reported, even when not verbose.
            print("No path pairs found, the output will be empty.",
                  file=error_file, flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs), file=error_file, flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file),
                  file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=input_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sub_index: int = kr.get_node1_column_index()
        if sub_index < 0:
            print("Missing node1 (subject) column.", file=error_file, flush=True)
        pred_index: int = kr.get_label_column_index()
        if pred_index < 0:
            print("Missing label (predicate) column.", file=error_file, flush=True)
        obj_index: int = kr.get_node2_column_index()
        if obj_index < 0:
            print("Missing node2 (object) column", file=error_file, flush=True)
        id_index: int = kr.get_id_column_index()
        if id_index < 0:
            print("Missing id column", file=error_file, flush=True)
        if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        # Edge properties are looked up by the actual input column names.
        predicate: str = kr.column_names[pred_index]
        id_col_name: str = kr.column_names[id_index]

        G = load_graph_from_kgtk(kr,
                                 directed=not undirected,
                                 ecols=(sub_index, obj_index),
                                 verbose=verbose,
                                 out=error_file)

        output_columns: typing.List[str] = ['node1', 'label', 'node2', 'id']
        kw: KgtkWriter = KgtkWriter.open(output_columns,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        id_count = 0
        if not statistics_only:
            # Copy the graph's own edges to the output first.
            for e in G.edges():
                sid, oid = e
                lbl = G.ep[predicate][e]
                kw.write([
                    G.vp[id_col][sid],
                    lbl,
                    G.vp[id_col][oid],
                    '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)
                ])
                id_count += 1
            if verbose:
                print("%d edges found." % id_count, file=error_file, flush=True)

        # The counter restarts for the path edges.
        id_count = 0
        path_id = 0
        for pair in pairs:
            source_node, target_node = pair
            source_ids = find_vertex(G, prop=G.properties[('v', id_col)],
                                     match=source_node)
            target_ids = find_vertex(G, prop=G.properties[('v', id_col)],
                                     match=target_node)
            # Pairs with a missing or ambiguous endpoint are silently skipped.
            if len(source_ids) == 1 and len(target_ids) == 1:
                source_id = source_ids[0]
                target_id = target_ids[0]
                if shortest_path:
                    _all_paths = all_shortest_paths(G, source_id, target_id,
                                                    edges=True)
                else:
                    _all_paths = all_paths(G, source_id, target_id,
                                           cutoff=max_hops, edges=True)

                for path in _all_paths:
                    node1: str = 'p%d' % path_id
                    for edge_num, an_edge in enumerate(path):
                        # Use the id column's real name rather than assuming
                        # it is literally 'id' (it may be an alias), in the
                        # same way the label column is looked up above.
                        edge_id = G.properties[('e', id_col_name)][an_edge]
                        kw.write([
                            node1,
                            str(edge_num),
                            edge_id,
                            '{}-{}-{}'.format(node1, edge_num, id_count)
                        ])
                        id_count += 1
                    path_id += 1

        if verbose:
            print("%d paths contining %d edges found." % (path_id, id_count),
                  file=error_file, flush=True)

        kw.close()
        kr.close()

    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise KGTKException('Error: ' + str(e)) from e
def process(self):
    """Concatenate the input KGTK files into a single output file.

    Every input file is opened first so that its header can be merged into
    a common column list; then the rows of each file are copied, in order,
    to the output file with their columns shuffled into the merged layout.

    Raises:
        ValueError: if standard input ("-") is listed more than once, if an
            edge file and a node file are mixed, if the number of output
            column names does not match the number of merged columns, or if
            a reader has no file path.

    All readers that were opened are closed on both the success and the
    failure paths; once opened, the writer is closed as well.
    """
    kmc: KgtkMergeColumns = KgtkMergeColumns()

    # Is the output file an edge file, a node file, or unknown?
    is_edge_file: bool = False
    is_node_file: bool = False

    krs: typing.List[KgtkReader] = []
    kr: KgtkReader
    idx: int

    if self.verbose:
        print("Starting kgtkcat pid=%d" % (os.getpid()),
              file=self.error_file, flush=True)

    if self.verbose:
        print("Opening the %d input files." % len(self.input_file_paths),
              file=self.error_file, flush=True)

    try:
        saw_stdin: bool = False
        input_file_path: Path
        for idx, input_file_path in enumerate(self.input_file_paths):
            if str(input_file_path) == "-":
                if saw_stdin:
                    raise ValueError("Duplicate standard input file %d" % (idx + 1))
                # Remember that stdin has been used so that a second "-" is
                # rejected (this flag used to be reset to False here).
                saw_stdin = True
                if self.verbose:
                    print("Opening file %d: standard input" % (idx + 1),
                          file=self.error_file, flush=True)
            else:
                if self.verbose:
                    print("Opening file %d: %s" % (idx + 1, str(input_file_path)),
                          file=self.error_file, flush=True)

            kr = KgtkReader.open(
                input_file_path,
                who="input " + str(idx + 1),
                options=self.reader_options,
                value_options=self.value_options,
                error_file=self.error_file,
                verbose=self.verbose,
                very_verbose=self.very_verbose,
            )
            krs.append(kr)

            # Unless directed otherwise, do not merge edge files with node
            # files.  If options.mode == KgtkReaderMode.NONE, then neither
            # kr.is_edge_file nor kr.is_node_file will be set and the
            # consistency check will be skipped.
            if kr.is_edge_file:
                if is_node_file:
                    raise ValueError(
                        "Cannot merge an edge file to a node file: %s" % input_file_path)
                if not is_edge_file and self.verbose:
                    print("The output file will be an edge file.",
                          file=self.error_file, flush=True)
                is_edge_file = True
            elif kr.is_node_file:
                if is_edge_file:
                    raise ValueError(
                        "Cannot merge a node file to an edge file: %s" % input_file_path)
                if not is_node_file and self.verbose:
                    print("The output file will be an node file.",
                          file=self.error_file, flush=True)
                is_node_file = True

            if self.verbose or self.very_verbose:
                print("Mapping the %d column names in %s."
                      % (len(kr.column_names), input_file_path),
                      file=self.error_file, flush=True)
            if self.very_verbose:
                print(" ".join(kr.column_names), file=self.error_file, flush=True)
            new_column_names: typing.List[str] = kmc.merge(kr.column_names)
            if self.very_verbose:
                print(" ".join(new_column_names), file=self.error_file, flush=True)

        if self.verbose or self.very_verbose:
            print("There are %d merged columns." % len(kmc.column_names),
                  file=self.error_file, flush=True)
        if self.very_verbose:
            print(" ".join(kmc.column_names), file=self.error_file, flush=True)

        if self.output_column_names is not None:
            if self.verbose:
                print("There are %d new output column names."
                      % len(self.output_column_names),
                      file=self.error_file, flush=True)
            if len(self.output_column_names) != len(kmc.column_names):
                raise ValueError(
                    "There are %d merged columns, but %d output column names."
                    % (len(kmc.column_names), len(self.output_column_names)))

        output_mode: KgtkWriter.Mode = KgtkWriter.Mode.NONE
        if is_edge_file:
            output_mode = KgtkWriter.Mode.EDGE
            if self.verbose:
                print("Opening the output edge file: %s" % str(self.output_path),
                      file=self.error_file, flush=True)
        elif is_node_file:
            output_mode = KgtkWriter.Mode.NODE
            if self.verbose:
                print("Opening the output node file: %s" % str(self.output_path),
                      file=self.error_file, flush=True)
        else:
            if self.verbose:
                print("Opening the output file: %s" % str(self.output_path),
                      file=self.error_file, flush=True)

        ew: KgtkWriter = KgtkWriter.open(
            kmc.column_names,
            self.output_path,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            use_mgzip=self.reader_options.use_mgzip,  # Hack!
            mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
            gzip_in_parallel=False,
            mode=output_mode,
            output_format=self.output_format,
            output_column_names=self.output_column_names,
            old_column_names=self.old_column_names,
            new_column_names=self.new_column_names,
            verbose=self.verbose,
            very_verbose=self.very_verbose)

        try:
            output_data_lines: int = 0
            for idx, kr in enumerate(krs):
                if kr.file_path is None:
                    # This shouldn't happen because we constrained all
                    # input_file_path elements to be not None.  However,
                    # checking here keeps mypy happy.
                    #
                    # TODO: throw a better exception.
                    raise ValueError("Missing file path.")
                input_file_path = kr.file_path
                if self.verbose:
                    print("Copying data from file %d: %s" % (idx + 1, input_file_path),
                          file=self.error_file, flush=True)

                # Map this file's (merged) column names onto output positions.
                shuffle_list: typing.List[int] = ew.build_shuffle_list(
                    kmc.new_column_name_lists[idx])

                input_data_lines: int = 0
                row: typing.List[str]
                for row in kr:
                    input_data_lines += 1
                    output_data_lines += 1
                    ew.write(row, shuffle_list=shuffle_list)

                # Flush the output file so far:
                ew.flush()

                if self.verbose:
                    print("Read %d data lines from file %d: %s"
                          % (input_data_lines, idx + 1, input_file_path),
                          file=self.error_file, flush=True)

            if self.verbose:
                print("Wrote %d lines total from %d files"
                      % (output_data_lines, len(krs)),
                      file=self.error_file, flush=True)
        finally:
            ew.close()
    finally:
        # Close every reader that was opened, whether or not we succeeded.
        for kr2 in krs:
            kr2.close()
def python_sort():
    """Sort the input KGTK file in memory and write it to the output file.

    This function takes no arguments: its settings (``columns``,
    ``numeric_sort``, ``reverse_sort``, ``input_path``, ``output_path``,
    ``verbose``, ``cleanup`` and so on) are free names that must be supplied
    by the enclosing scope.

    The sort key of a row is built by joining the selected column values
    with ``KgtkFormat.KEY_FIELD_SEPARATOR``.  Columns may be given by name
    or by 1-based column number.  When no columns are given, node files are
    sorted on the id column and edge files on (id if present,) node1, label
    and node2.  Rows with equal keys keep their input order.

    Raises:
        KGTKException: for unsupported options (numeric/reverse column
            lists, numeric sort on several columns), for an invalid column
            number or unknown column name, or when the file mode is unknown
            and no sort columns were given.
    """
    if numeric_columns is not None and len(numeric_columns) > 0:
        raise KGTKException(
            'Error: the pure Python sorter does not currently support numeric column sorts.'
        )

    if reverse_columns is not None and len(reverse_columns) > 0:
        raise KGTKException(
            'Error: the pure Python sorter does not currently support reverse column sorts.'
        )

    if verbose:
        print("Opening the input file: %s" % str(input_path),
              file=error_file, flush=True)
    kr: KgtkReader = KgtkReader.open(
        input_path,
        options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=verbose,
        very_verbose=very_verbose,
    )

    sort_idx: int
    key_idxs: typing.List[int] = []
    if columns is not None and len(columns) > 0:
        # Process the list of column names, including splitting
        # comma-separated lists of column names.
        column_name: str
        for column_name in columns:
            column_name_2: str
            for column_name_2 in column_name.split(","):
                column_name_2 = column_name_2.strip()
                if len(column_name_2) == 0:
                    continue
                if column_name_2.isdigit():
                    # Column numbers are 1-based.  Reject 0 explicitly:
                    # otherwise it would become index -1 and silently sort
                    # on the last column.
                    sort_idx = int(column_name_2)
                    if sort_idx < 1 or sort_idx > len(kr.column_names):
                        kr.close()
                        cleanup()
                        raise KGTKException(
                            "Invalid column number %d (max %d)."
                            % (sort_idx, len(kr.column_names)))
                    key_idxs.append(sort_idx - 1)
                else:
                    if column_name_2 not in kr.column_names:
                        kr.close()
                        cleanup()
                        raise KGTKException("Unknown column_name %s" % column_name_2)
                    key_idxs.append(kr.column_name_map[column_name_2])
    else:
        if kr.is_node_file:
            key_idxs.append(kr.id_column_idx)
        elif kr.is_edge_file:
            if kr.id_column_idx >= 0:
                key_idxs.append(kr.id_column_idx)
            key_idxs.append(kr.node1_column_idx)
            key_idxs.append(kr.label_column_idx)
            key_idxs.append(kr.node2_column_idx)
        else:
            # Close the reader here too, as the other error paths do.
            kr.close()
            cleanup()
            raise KGTKException(
                "Unknown KGTK file mode, please specify the sorting columns."
            )

    if verbose:
        print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]),
              file=error_file, flush=True)

    if numeric_sort and len(key_idxs) > 1:
        kr.close()
        raise KGTKException(
            'Error: the pure Python sorter does not currently support numeric sorts on multiple columns.'
        )

    # Map each sort key to the rows that share it, in input order.
    lines: typing.MutableMapping[typing.Union[str, float],
                                 typing.List[typing.List[str]]] = dict()

    progress_startup()
    data_line_count: int = 0
    key: typing.Union[str, float]
    row: typing.List[str]
    for row in kr:
        data_line_count += 1
        key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx] for idx in key_idxs)
        if numeric_sort:
            key = float(key)
        # There may be multiple rows with the same key.  Appending keeps
        # them in input order, which makes this a stable sort.
        lines.setdefault(key, []).append(row)

    if verbose:
        # Report the number of rows read, not the number of distinct keys.
        print("\nRead %d data lines." % data_line_count,
              file=error_file, flush=True)

    kw = KgtkWriter.open(kr.column_names,
                         output_path,
                         mode=KgtkWriter.Mode[kr.mode.name],
                         verbose=verbose,
                         very_verbose=very_verbose)

    for key in sorted(lines.keys(), reverse=reverse_sort):
        for row in lines[key]:
            kw.write(row)

    kw.close()
    kr.close()
def process(self):
    """Join the left and right KGTK files into a single output file.

    Both inputs are opened, checked with ``ok_to_join``, and their join
    columns resolved.  The rows of the left file are written first,
    followed by the rows of the right file shuffled into the merged column
    layout.  When ``join_key_sets`` returns a key set, only rows whose join
    key is in that set are written; when it returns None every row is kept.

    Returns 1 (after closing both readers) when the files may not be joined
    or when the two join keys have different lengths.  Otherwise nothing is
    returned explicitly.
    """
    def close_inputs():
        # Release both readers before an early exit.
        left_reader.close()
        right_reader.close()

    if self.verbose:
        print("Opening the left edge file: %s" % str(self.left_file_path),
              file=self.error_file, flush=True)
    left_reader: KgtkReader = KgtkReader.open(
        self.left_file_path,
        who="left input",
        options=self.left_reader_options,
        value_options=self.value_options,
        error_file=self.error_file,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    if self.verbose:
        print("Opening the right edge file: %s" % str(self.right_file_path),
              file=self.error_file, flush=True)
    right_reader: KgtkReader = KgtkReader.open(
        self.right_file_path,
        who="right input",
        options=self.right_reader_options,
        value_options=self.value_options,
        error_file=self.error_file,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    if not self.ok_to_join(left_reader, right_reader):
        close_inputs()
        return 1

    left_idxs: typing.List[int] = self.build_join_idx_list(
        left_reader, self.LEFT, self.left_join_columns)
    right_idxs: typing.List[int] = self.build_join_idx_list(
        right_reader, self.RIGHT, self.right_join_columns)
    left_key_size: int = len(left_idxs)
    right_key_size: int = len(right_idxs)
    if left_key_size != right_key_size:
        print("the left join key has %d components, the right join key has %d columns. Exiting."
              % (left_key_size, right_key_size),
              file=self.error_file, flush=True)
        close_inputs()
        return 1

    # This might open the input files for a second time. This won't work with stdin.
    joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets(
        left_idxs, right_idxs)

    if self.verbose:
        print("Mapping the column names for the join.",
              file=self.error_file, flush=True)
    merger: KgtkMergeColumns = KgtkMergeColumns()
    merger.merge(left_reader.column_names, prefix=self.left_prefix)
    right_column_names: typing.List[str] = merger.merge(
        right_reader.column_names, prefix=self.right_prefix)
    joined_column_names: typing.List[str] = merger.column_names

    if self.verbose:
        print(" left columns: %s" % " ".join(left_reader.column_names),
              file=self.error_file, flush=True)
        print(" right columns: %s" % " ".join(right_reader.column_names),
              file=self.error_file, flush=True)
        print("mapped right columns: %s" % " ".join(right_column_names),
              file=self.error_file, flush=True)
        print(" joined columns: %s" % " ".join(joined_column_names),
              file=self.error_file, flush=True)

    if self.verbose:
        print("Opening the output edge file: %s" % str(self.output_path),
              file=self.error_file, flush=True)
    writer: KgtkWriter = KgtkWriter.open(
        joined_column_names,
        self.output_path,
        mode=left_reader.mode,
        require_all_columns=False,
        prohibit_extra_columns=True,
        fill_missing_columns=True,
        gzip_in_parallel=False,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    output_data_lines: int = 0
    left_data_lines_read: int = 0
    left_data_lines_kept: int = 0
    right_data_lines_read: int = 0
    right_data_lines_kept: int = 0

    if self.verbose:
        print("Processing the left input file: %s" % str(self.left_file_path),
              file=self.error_file, flush=True)
    row: typing.List[str]
    for row in left_reader:
        left_data_lines_read += 1
        # With no key set every row is kept; the key is only built when
        # there is a set to test it against.
        if (joined_key_set is None
                or self.build_join_key(left_reader, left_idxs, row) in joined_key_set):
            writer.write(row)
            output_data_lines += 1
            left_data_lines_kept += 1

    # Flush the output file so far:
    writer.flush()

    if self.verbose:
        print("Processing the right input file: %s" % str(self.right_file_path),
              file=self.error_file, flush=True)
    right_shuffle_list: typing.List[int] = writer.build_shuffle_list(right_column_names)
    for row in right_reader:
        right_data_lines_read += 1
        if (joined_key_set is None
                or self.build_join_key(right_reader, right_idxs, row) in joined_key_set):
            writer.write(row, shuffle_list=right_shuffle_list)
            output_data_lines += 1
            right_data_lines_kept += 1

    writer.close()

    if self.verbose:
        print("The join is complete", file=self.error_file, flush=True)
        print("%d left input data lines read, %d kept"
              % (left_data_lines_read, left_data_lines_kept),
              file=self.error_file, flush=True)
        print("%d right input data lines read, %d kept"
              % (right_data_lines_read, right_data_lines_kept),
              file=self.error_file, flush=True)
        print("%d data lines written." % output_data_lines,
              file=self.error_file, flush=True)
def main():
    """
    Test the KGTK ID builder.

    Parses the command line, copies the input KGTK file to the output file
    and lets a KgtkIdBuilder build the IDs along the way.
    """
    parser: ArgumentParser = ArgumentParser()
    parser.add_argument(
        dest="input_file_path",
        help="The KGTK file with the input data (default=%(default)s)",
        type=Path,
        nargs="?",
        default="-",
    )
    parser.add_argument(
        "-o", "--output-file",
        dest="output_file_path",
        help="The KGTK file to write (default=%(default)s).",
        type=Path,
        default="-",
    )

    # Let each option class register its own command line arguments.
    KgtkIdBuilderOptions.add_arguments(parser)
    KgtkReader.add_debug_arguments(parser)
    KgtkReaderOptions.add_arguments(parser, mode_options=True)
    KgtkValueOptions.add_arguments(parser)

    args: Namespace = parser.parse_args()

    # Error messages go to stderr unless stdout was requested.
    if args.errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(args)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(args)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(args)

    # Show the final option structures for debugging and documentation.
    if args.show_options:
        print("input: %s" % str(args.input_file_path),
              file=error_file, flush=True)
        print("--output-file=%s" % str(args.output_file_path),
              file=error_file, flush=True)
        for options in (idbuilder_options, reader_options, value_options):
            options.show(out=error_file)

    # The order matters: the reader supplies parameters to the ID builder,
    # and the ID builder supplies the (possibly revised) column names that
    # the writer needs.
    reader: KgtkReader = KgtkReader.open(
        args.input_file_path,
        error_file=error_file,
        options=reader_options,
        value_options=value_options,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    id_builder: KgtkIdBuilder = KgtkIdBuilder.new(reader, idbuilder_options)

    writer: KgtkWriter = KgtkWriter.open(
        id_builder.column_names,
        args.output_file_path,
        mode=reader.mode,
        require_all_columns=True,
        prohibit_extra_columns=True,
        fill_missing_columns=False,
        gzip_in_parallel=False,
        verbose=args.verbose,
        very_verbose=args.very_verbose,
    )

    # Process the input file, building IDs.
    id_builder.process(reader, writer)

    # Shut down: output first, then input.
    writer.close()
    reader.close()
def process(self):
    """Explode one column of the input KGTK file into per-field columns.

    The fields to extract are chosen from ``self.type_names`` (through
    ``KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS``) or, when given, from
    ``self.field_names``, which replaces the type-based selection.  Each
    selected field gets a column named ``self.prefix + field_name``: an
    existing column is reused when ``self.overwrite_columns`` is set,
    otherwise a new column is appended.

    Rows whose value in the source column is invalid are copied through
    unexploded.  When ``self.expand_list`` is set, a list value produces
    one output row per list item.

    Raises:
        ValueError: if the column name is empty, a type or field name is
            unknown, no fields are selected, the source column is missing,
            or an exploded column already exists and may not be
            overwritten.
    """
    if len(self.column_name) == 0:
        raise ValueError("The name of the column to explode is empty.")

    selected_field_names: typing.List[str] = []
    field_name: str

    if self.type_names is not None:
        if self.verbose:
            print("Validate the names of the data types to extract.",
                  file=self.error_file, flush=True)
        type_name: str
        for type_name in self.type_names:
            if type_name not in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS:
                raise ValueError("Unknown data type name '%s'." % type_name)
            # Merge this KGTK data type's fields into the list of selected fields:
            for field_name in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS[type_name]:
                if field_name not in selected_field_names:
                    selected_field_names.append(field_name)

    if self.field_names is not None:
        # Forget the fields selected above, choose these instead:
        selected_field_names = []
        if self.verbose:
            print("Validate the names of the fields to extract.",
                  file=self.error_file, flush=True)
        for field_name in self.field_names:
            if field_name not in KgtkValueFields.FIELD_NAMES:
                raise ValueError("Unknown field name '%s'." % field_name)
            # Merge this field into the list of selected fields:
            if field_name not in selected_field_names:
                selected_field_names.append(field_name)

    if len(selected_field_names) == 0:
        raise ValueError("The list of fields to explode is empty.")

    # Open the input file.
    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin",
                  file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        options=self.reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    if self.verbose:
        print("Check that the source column '%s' is present." % self.column_name,
              file=self.error_file, flush=True)
    if self.column_name not in kr.column_name_map:
        raise ValueError("Column name '%s' not found in the input file."
                         % self.column_name)
    column_idx: int = kr.column_name_map[self.column_name]

    if self.verbose:
        print("Build the map of exploded columns and list of new column names",
              file=self.error_file, flush=True)
    # Maps each field name to the index of the output column that holds it.
    explosion: typing.MutableMapping[str, int] = {}
    column_names: typing.List[str] = kr.column_names.copy()
    for field_name in selected_field_names:
        exploded_name: str = self.prefix + field_name
        if self.verbose:
            print("Field '%s' becomes '%s'" % (field_name, exploded_name),
                  file=self.error_file, flush=True)
        # The map is keyed by field name, so that is what must be tested
        # (the prefixed column name would never match a key).
        if field_name in explosion:
            raise ValueError("Field name '%s' is duplicated in the field list."
                             % field_name)
        if exploded_name in kr.column_names:
            if self.overwrite_columns:
                existing_idx = kr.column_name_map[exploded_name]
                explosion[field_name] = existing_idx
                if self.verbose:
                    print("Field '%s' is overwriting existing column '%s' (idx=%d)"
                          % (field_name, exploded_name, existing_idx),
                          file=self.error_file, flush=True)
            else:
                raise ValueError(
                    "Exploded column '%s' already exists and not allowed to overwrite"
                    % exploded_name)
        else:
            column_names.append(exploded_name)
            exploded_idx: int = len(column_names) - 1
            explosion[field_name] = exploded_idx
            if self.verbose:
                print("Field '%s' becomes new column '%s' (idx=%d)"
                      % (field_name, exploded_name, exploded_idx),
                      file=self.error_file, flush=True)

    new_column_count: int = len(column_names) - kr.column_count
    if self.verbose:
        # Diagnostics go to the error file so that they cannot be mixed
        # into data written to standard output.
        print("%d columns + %d columns = %d columns"
              % (kr.column_count, new_column_count, len(column_names)),
              file=self.error_file, flush=True)
        print("Explosion length: %d" % len(explosion),
              file=self.error_file, flush=True)

    # Open the output file.
    ew: KgtkWriter = KgtkWriter.open(column_names,
                                     self.output_file_path,
                                     mode=kr.mode,
                                     output_format=self.output_format,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    if self.verbose:
        print("Expanding records from %s" % self.input_file_path,
              file=self.error_file, flush=True)
    input_line_count: int = 0
    output_line_count: int = 0
    row: typing.List[str]
    for row in kr:
        input_line_count += 1

        # Parse the value for the column being exploded:
        item_to_explode: str = row[column_idx]
        value: KgtkValue = KgtkValue(item_to_explode,
                                     options=self.value_options,
                                     parse_fields=True)
        value.validate()
        if not value.is_valid():
            if self.verbose:
                print("Not exploding invalid item '%s' in input line %d"
                      % (item_to_explode, input_line_count),
                      file=self.error_file, flush=True)
            ew.write(row)  # This will be filled to the proper length
            output_line_count += 1
            continue

        if self.expand_list and value.is_list():
            if self.verbose:
                print("Expanding a list: '%s'" % item_to_explode,
                      file=self.error_file, flush=True)
            subvalue: KgtkValue
            for subvalue in value.get_list_items():
                if self.very_verbose:
                    print("Exploding '%s'" % subvalue.value,
                          file=self.error_file, flush=True)
                ew.write(self.explode(subvalue, row, explosion, new_column_count))
                output_line_count += 1
        else:
            if self.very_verbose:
                print("Exploding '%s'" % value.value,
                      file=self.error_file, flush=True)
            ew.write(self.explode(value, row, explosion, new_column_count))
            output_line_count += 1

    if self.verbose:
        print("Read %d records, wrote %d records."
              % (input_line_count, output_line_count),
              file=self.error_file, flush=True)

    ew.close()
    kr.close()
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]],
        split_on_commas: bool,
        split_on_spaces: bool,
        strip_spaces: bool,
        all_except: bool,
        ignore_missing_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file, removing (or retaining only) the named columns.

    ``columns`` holds the column-name arguments; they are optionally split
    on spaces and/or commas and stripped, and empty names are dropped.
    With ``all_except`` the named columns are the only ones kept (in input
    order); otherwise they are removed.  Unknown column names are an error
    unless ``ignore_missing_columns`` is set.  Naming the same column more
    than once is harmless.

    Returns:
        0 on success; 1 after an exception has been passed to
        ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        print("--split-on-commas=%s" % str(split_on_commas), file=error_file)
        print("--split-on-spaces=%s" % str(split_on_spaces), file=error_file)
        print("--strip-spaces=%s" % str(strip_spaces), file=error_file)
        print("--all-except=%s" % str(all_except), file=error_file)
        print("--ignore-missing-columns=%s" % str(ignore_missing_columns),
              file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        if columns is None:
            columns = []  # This simplifies matters.

        if split_on_spaces:
            # We will be very lenient, and allow space-seperated arguments
            # *inside* shell quoting, e.g.
            #
            # kgtk remove_columns -c 'name name2 name3'
            #
            # Do not enable this option if spaces are legal inside your
            # column names.
            columns = " ".join(columns).split()

        remove_columns: typing.List[str] = []
        arg: str
        column_name: str
        for arg in columns:
            if split_on_commas:
                for column_name in arg.split(","):
                    if strip_spaces:
                        column_name = column_name.strip()
                    if len(column_name) > 0:
                        remove_columns.append(column_name)
            else:
                if strip_spaces:
                    arg = arg.strip()
                if len(arg) > 0:
                    remove_columns.append(arg)

        if verbose:
            if all_except:
                print("Removing all columns except %d columns: %s"
                      % (len(remove_columns), " ".join(remove_columns)),
                      file=error_file, flush=True)
            else:
                print("Removing %d columns: %s"
                      % (len(remove_columns), " ".join(remove_columns)),
                      file=error_file, flush=True)
        if len(remove_columns) == 0:
            raise KGTKException("No columns to remove")

        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        try:
            output_column_names: typing.List[str]
            trouble_column_names: typing.List[str] = []
            if all_except:
                if not ignore_missing_columns:
                    for column_name in remove_columns:
                        if column_name not in kr.column_names:
                            print("Error: cannot retain unknown column '%s'."
                                  % column_name, file=error_file, flush=True)
                            trouble_column_names.append(column_name)

                # Keep the named columns, in their input order.
                output_column_names = []
                for column_name in kr.column_names:
                    if column_name in remove_columns:
                        output_column_names.append(column_name)
            else:
                output_column_names = kr.column_names.copy()
                for column_name in remove_columns:
                    if column_name in output_column_names:
                        output_column_names.remove(column_name)
                    elif column_name in kr.column_names:
                        # The column was named more than once and has
                        # already been removed: it is not an unknown column.
                        continue
                    elif not ignore_missing_columns:
                        print("Error: cannot remove unknown column '%s'."
                              % column_name, file=error_file, flush=True)
                        trouble_column_names.append(column_name)

            if len(trouble_column_names) > 0:
                raise KGTKException("Unknown columns %s"
                                    % " ".join(trouble_column_names))

            if verbose:
                print("Opening the output file: %s" % str(output_kgtk_file),
                      file=error_file, flush=True)
            kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode[kr.mode.name],
                                             verbose=verbose,
                                             very_verbose=very_verbose)

            # Map input column positions onto the reduced output columns.
            shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

            input_line_count: int = 0
            row: typing.List[str]
            for row in kr:
                input_line_count += 1
                kw.write(row, shuffle_list=shuffle_list)

            if verbose:
                print("Processed %d rows." % (input_line_count),
                      file=error_file, flush=True)

            kw.close()
        finally:
            # Release the input file on success and on failure.
            kr.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(input_files: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        header_only: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Validate one or more KGTK files by reading them with KgtkReader.

    Each file is opened, which processes its header; unless ``header_only``
    is set, every data line is then read as well.  Messages are written to
    stdout unless ``errors_to_stderr`` is set.  ``errors_to_stdout`` is
    accepted but not consulted by this function.

    Returns:
        0 when all files have been read.

    Raises:
        KGTKException: wraps a SystemExit or any other exception raised
            while reading.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    kgtk_files: typing.List[Path] = KGTKArgumentParser.get_input_file_list(input_files)

    # Select where to send error messages, defaulting to stdout.
    error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-files: %s" % " ".join((str(kgtk_file) for kgtk_file in kgtk_files)),
              file=error_file)
        print("--header-only=%s" % str(header_only), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kgtk_file: Path
        for kgtk_file in kgtk_files:
            if verbose:
                # Send the separator to the same stream as the other
                # messages so they stay together when stderr is selected.
                print("\n====================================================",
                      file=error_file, flush=True)
                if str(kgtk_file) != "-":
                    print("Validating '%s'" % str(kgtk_file),
                          file=error_file, flush=True)
                else:
                    print ("Validating from stdin", file=error_file, flush=True)

            kr: KgtkReader = KgtkReader.open(kgtk_file,
                                             error_file=error_file,
                                             options=reader_options,
                                             value_options=value_options,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            try:
                if header_only:
                    if verbose:
                        print("Validated the header only.",
                              file=error_file, flush=True)
                else:
                    # Reading each row is what performs the validation; the
                    # rows themselves are not needed.
                    line_count: int = 0
                    row: typing.List[str]
                    for row in kr:
                        line_count += 1
                    if verbose:
                        print("Validated %d data lines" % line_count,
                              file=error_file, flush=True)
            finally:
                # Close the reader in both modes, and on errors.
                kr.close()
        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]] = None,
        locale: str = "C",
        reverse_sort: bool = False,
        reverse_columns: typing.Optional[typing.List[str]] = None,
        numeric_sort: bool = False,
        numeric_columns: typing.Optional[typing.List[str]] = None,
        pure_python: bool = False,
        extra: typing.Optional[str] = None,
        bash_command: str = "bash",
        bzip2_command: str = "bzip2",
        gzip_command: str = "gzip",
        pgrep_command: str = "pgrep",
        sort_command: str = "sort",
        xz_command: str = "xz",
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Sort a KGTK file, either in memory or with the system sort command.

    With pure_python the whole file is read into a dict keyed by the sort key
    and written back in key order.  Otherwise a shell pipeline is started:
    the header line is copied to Python through a pipe, Python translates the
    requested columns into `sort` "-k" options, and those options are sent
    back to the shell through a second pipe.

    Args:
        input_file, output_file: file specifications resolved with
            KGTKArgumentParser; "-" means standard input/output.
        columns: column names or 1-based column numbers; each entry may be a
            comma-separated list.  When empty, default keys are chosen from
            the file type (node or edge).
        locale: value for LC_ALL when running the sort command ("" to omit).
        reverse_sort, numeric_sort: apply to the whole sort.
        reverse_columns, numeric_columns: per-column modifiers (not supported
            by the pure Python sorter).
        pure_python: sort in memory instead of using the sort command.
        extra: extra text appended to the sort command options.
        bash_command, bzip2_command, gzip_command, pgrep_command,
        sort_command, xz_command: names of the external programs to run.
        errors_to_stdout: send messages to stdout instead of stderr.
        errors_to_stderr, show_options: accepted; not used in this function.
        verbose, very_verbose: feedback controls.
        **kwargs: source for KgtkReaderOptions and KgtkValueOptions.

    Returns:
        0 on success.

    Raises:
        KGTKException: for invalid arguments; on the external sort path every
            exception is re-raised as a KGTKException "INTERNAL ERROR".
    """
    from io import StringIO
    import os
    from pathlib import Path
    import sh  # type: ignore
    import sys
    import typing

    from kgtk.cli_entry import progress_startup
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_path: Path = KGTKArgumentParser.get_input_file(input_file)
    output_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    def python_sort() -> int:
        """Sort the input in memory; returns 0 on success."""
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException('Error: the pure Python sorter does not currently support numeric column sorts.')

        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException('Error: the pure Python sorter does not currently support reverse column sorts.')

        if verbose:
            print("Opening the input file: %s" % str(input_path), file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(input_path,
                                         options=reader_options,
                                         value_options=value_options,
                                         error_file=error_file,
                                         verbose=verbose,
                                         very_verbose=very_verbose,
                                         )

        sort_idx: int
        key_idxs: typing.List[int] = []
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            column_name: str
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        sort_idx = int(column_name_2)
                        # Column numbers are 1-based: 0 would silently select
                        # the last column through index -1.
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException("Invalid column number %d (max %d)." % (sort_idx, len(kr.column_names)))
                        key_idxs.append(sort_idx - 1)
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" % column_name_2)
                        key_idxs.append(kr.column_name_map[column_name_2])
        else:
            if kr.is_node_file:
                key_idxs.append(kr.id_column_idx)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    key_idxs.append(kr.id_column_idx)

                key_idxs.append(kr.node1_column_idx)
                key_idxs.append(kr.label_column_idx)
                key_idxs.append(kr.node2_column_idx)
            else:
                cleanup()
                raise KGTKException("Unknown KGTK file mode, please specify the sorting columns.")

        if verbose:
            print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]), file=error_file, flush=True)

        if numeric_sort and len(key_idxs) > 1:
            raise KGTKException('Error: the pure Python sorter does not currently support numeric sorts on multiple columns.')

        lines: typing.MutableMapping[typing.Union[str, float], typing.List[typing.List[str]]] = dict()

        progress_startup()
        data_line_count: int = 0
        key: typing.Union[str, float]
        row: typing.List[str]
        for row in kr:
            data_line_count += 1
            key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx] for idx in key_idxs)
            if numeric_sort:
                key = float(key)
            if key in lines:
                # There are multiple rows with the same key.  Make this a stable sort.
                lines[key].append(row)
            else:
                lines[key] = [row]
        if verbose:
            # Report rows read, not the number of distinct keys.
            print("\nRead %d data lines." % data_line_count, file=error_file, flush=True)

        kw = KgtkWriter.open(kr.column_names,
                             output_path,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             verbose=verbose,
                             very_verbose=very_verbose)

        for key in sorted(lines.keys(), reverse=reverse_sort):
            for row in lines[key]:
                kw.write(row)

        kw.close()
        kr.close()
        return 0

    if pure_python:
        return python_sort()

    try:
        global header_read_fd
        global header_write_fd
        header_read_fd, header_write_fd = os.pipe()
        # The shell writes the header to this pipe, so it must inherit the write end.
        os.set_inheritable(header_write_fd, True)
        if verbose:
            print("header pipe: read_fd=%d write_fd=%d" % (header_read_fd, header_write_fd), file=error_file, flush=True)

        global sortopt_read_fd
        global sortopt_write_fd
        sortopt_read_fd, sortopt_write_fd = os.pipe()
        # The shell reads the sort options from this pipe, so it must inherit the read end.
        os.set_inheritable(sortopt_read_fd, True)
        if verbose:
            print("sort options pipe: read_fd=%d write_fd=%d" % (sortopt_read_fd, sortopt_write_fd), file=error_file, flush=True)

        locale_envar: str = "LC_ALL=%s" % locale if len(locale) > 0 else ""

        # Note: "read -u n", used below, is not supported by some shells.
        # bash and zsh support it.
        # ash, csh, dash, and tcsh do not.
        # The original standard Bourne shell, sh, does not.
        # ksh might do it, if the FD number is a single digit.
        cmd: str = "".join((
            "{ IFS= read -r header ; ",  # Read the header line
            " { printf \"%s\\n\" \"$header\" >&" + str(header_write_fd) + " ; } ; ",  # Send the header to Python
            " printf \"%s\\n\" \"$header\" ; ",  # Send the header to standard output (which may be redirected to a file, below).
            " IFS= read -u " + str(sortopt_read_fd) + " -r options ; ",  # Read the sort command options from Python.
            " %s %s -t '\t' $options ; } " % (locale_envar, sort_command),  # Sort the remaining input lines using the options read from Python.
        ))

        if str(output_path) != "-":
            # Do we want to compress the output?
            output_suffix: str = output_path.suffix.lower()
            if output_suffix in [".gz", ".z"]:
                if verbose:
                    print("gzip output file: %s" % repr(str(output_path)), file=error_file, flush=True)
                cmd += " | " + gzip_command + " -"

            elif output_suffix in [".bz2", ".bz"]:
                if verbose:
                    print("bzip2 output file: %s" % repr(str(output_path)), file=error_file, flush=True)
                cmd += " | " + bzip2_command + " -z"

            elif output_suffix in [".xz", ".lzma"]:
                if verbose:
                    print("xz output file: %s" % repr(str(output_path)), file=error_file, flush=True)
                cmd += " | " + xz_command + " -z -"

            # Feed the sorted output to the named file.  Otherwise, the sorted
            # output goes to standard output without passing through Python.
            cmd += " > " + repr(str(output_path))

        if verbose:
            print("sort command: %s" % cmd, file=error_file, flush=True)

        global cat_proc
        cat_proc = None
        global cmd_proc
        cmd_proc = None

        def cat_done(cmd, success, exit_code):
            # When the cat command finishes, monitor the progress of the sort command.
            if verbose:
                print("\nDone reading the input file", file=error_file, flush=True)
            if cmd_proc is None:
                return

            # Locate the sort command using pgrep
            buf = StringIO()
            try:
                sh_pgrep = sh.Command(pgrep_command)
                sh_pgrep("-g", cmd_proc.pgid, "--newest", sort_command, _out=buf)
                pgrep_output = buf.getvalue()
                if len(pgrep_output) == 0:
                    if verbose:
                        print("Unable to locate the sort command.", file=error_file, flush=True)
                    return
                sort_pid = int(pgrep_output)
            except Exception as e:
                # Progress monitoring is best effort: report and carry on.
                if verbose:
                    print("Exception looking for sort command: %s" % str(e), file=error_file, flush=True)
                return
            finally:
                buf.close()

            if verbose:
                print("Monitoring the sort command (pid=%d)" % sort_pid, file=error_file, flush=True)
            progress_startup(pid=sort_pid)

        if str(input_path) == "-":
            # Read from standard input.
            #
            # Sh version 1.13 or greater is required for _pass_fds.
            sh_bash = sh.Command(bash_command)
            cmd_proc = sh_bash("-c", cmd, _in=sys.stdin, _out=sys.stdout, _err=sys.stderr,
                               _bg=True, _bg_exc=False, _internal_bufsize=1,
                               _pass_fds={header_write_fd, sortopt_read_fd})

            # It would be nice to monitor the sort command here.  Unfortunately, there
            # is a race condition that makes this difficult.  We could loop until the
            # sort command is created, then monitor it.

        else:
            # Feed the named file into the data processing pipeline,
            # decompressing it first when the suffix calls for it.
            input_suffix: str = input_path.suffix.lower()
            decompress_command: typing.Optional[str] = None
            decompress_label: str = ""
            if input_suffix in [".gz", ".z"]:
                decompress_command, decompress_label = gzip_command, "gunzip"
            elif input_suffix in [".bz2", ".bz"]:
                decompress_command, decompress_label = bzip2_command, "bunzip2"
            elif input_suffix in [".xz", ".lzma"]:
                decompress_command, decompress_label = xz_command, "unxz"

            if decompress_command is not None:
                if verbose:
                    print("%s input file: %s" % (decompress_label, repr(str(input_path))), file=error_file, flush=True)
                sh_decompress = sh.Command(decompress_command)
                cat_proc = sh_decompress(input_path, "-dc", _in=sys.stdin, _piped=True, _err=sys.stderr,
                                         _bg=True, _bg_exc=False, _internal_bufsize=1, _done=cat_done)
                if verbose:
                    print("full command: %s -dc %s | %s" % (decompress_command, repr(str(input_path)), cmd),
                          file=error_file, flush=True)
            else:
                if verbose:
                    print("input file: %s" % repr(str(input_path)), file=error_file, flush=True)
                cat_proc = sh.cat(input_path, _in=sys.stdin, _piped=True, _err=sys.stderr,
                                  _bg=True, _bg_exc=False, _internal_bufsize=1, _done=cat_done)
                if verbose:
                    print("full command: cat %s | %s" % (repr(str(input_path)), cmd), file=error_file, flush=True)

            # If enabled, monitor the progress of reading the input file.
            # Since we do not have access to the pid of the sort command,
            # we cannot monitor the progress of the merge phases.
            if verbose:
                print("Monitoring the cat command (pid=%d)." % cat_proc.pid, file=error_file, flush=True)
            progress_startup(pid=cat_proc.pid)

            # Sh version 1.13 or greater is required for _pass_fds.
            sh_bash = sh.Command(bash_command)
            cmd_proc = sh_bash(cat_proc, "-c", cmd, _out=sys.stdout, _err=sys.stderr,
                               _bg=True, _bg_exc=False, _internal_bufsize=1,
                               _pass_fds={header_write_fd, sortopt_read_fd})
            # Since we do not have access to the pid of the sort command,
            # we cannot monitor the progress of the merge phases.
            if verbose:
                print("Running the sort script (pid=%d)." % cmd_proc.pid, file=error_file, flush=True)

        if verbose:
            print("Reading the KGTK input file header line with KgtkReader", file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(Path("<%d" % header_read_fd),
                                         options=reader_options,
                                         value_options=value_options,
                                         error_file=error_file,
                                         verbose=verbose,
                                         very_verbose=very_verbose,
                                         )
        if verbose:
            print("KGTK header: %s" % " ".join(kr.column_names), file=error_file, flush=True)

        sort_options: str = ""
        if reverse_sort:
            sort_options += " --reverse"
        if numeric_sort:
            sort_options += " --numeric"
        if extra is not None and len(extra) > 0:
            sort_options += " " + extra

        # We will consume entries in reverse_columns and numeric_columns,
        # then complain if any are left over.
        if reverse_columns is not None:
            reverse_columns = reverse_columns[:]  # Protect against modifying a shared list.
        if numeric_columns is not None:
            numeric_columns = numeric_columns[:]  # Protect against modifying a shared list.

        def key_option(key_position: int, key_name: str) -> str:
            # Build the "-k" option for one 1-based field position, adding the
            # "r"/"n" modifiers when key_name is listed in reverse_columns or
            # numeric_columns, and consuming the matched entries.
            option: str = " -k %d,%d" % (key_position, key_position)
            if reverse_columns is not None and key_name in reverse_columns:
                option += "r"
                reverse_columns.remove(key_name)
            if numeric_columns is not None and key_name in numeric_columns:
                option += "n"
                numeric_columns.remove(key_name)
            return option

        column_name: str
        sort_idx: int
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        sort_idx = int(column_name_2)
                        # Field numbers for the sort command start at 1.
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException("Invalid column number %d (max %d)." % (sort_idx, len(kr.column_names)))
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" % repr(column_name_2))
                        sort_idx = kr.column_name_map[column_name_2] + 1
                    sort_options += key_option(sort_idx, column_name_2)
        else:
            # TODO: support the case where the column name in reverse_columns
            # or numeric_columns is an alias of the name used in the file header.
            default_key_idxs: typing.List[int] = []
            if kr.is_node_file:
                default_key_idxs.append(kr.id_column_idx)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    default_key_idxs.append(kr.id_column_idx)
                default_key_idxs.append(kr.node1_column_idx)
                default_key_idxs.append(kr.label_column_idx)
                default_key_idxs.append(kr.node2_column_idx)
            else:
                cleanup()
                raise KGTKException("Unknown KGTK file mode, please specify the sorting columns.")

            default_key_idx: int
            for default_key_idx in default_key_idxs:
                sort_options += key_option(default_key_idx + 1, kr.column_names[default_key_idx])

        # Check for unconsumed entries in reverse_columns and numeric_columns:
        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException("Unknown reverse column(s) %s" % " ".join([repr(column_name) for column_name in reverse_columns]))
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException("Unknown numeric column(s) %s" % " ".join([repr(column_name) for column_name in numeric_columns]))

        if verbose:
            print("sort options: %s" % sort_options, file=error_file, flush=True)

        kr.close()  # We are done with the KgtkReader now.

        # Send the sort options back to the data processing pipeline.
        with open(sortopt_write_fd, "w") as options_file:
            options_file.write(sort_options + "\n")

        if verbose:
            print("\nWaiting for the sort command to complete.\n", file=error_file, flush=True)
        cmd_proc.wait()

        if verbose:
            print("Cleanup.", file=error_file, flush=True)
        cleanup()

        return 0

    except Exception as e:
        # import traceback
        # traceback.print_tb(sys.exc_info()[2], 10)
        raise KGTKException('INTERNAL ERROR: ' + type(e).__module__ + '.' + str(e) + '\n') from e
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        compute_degrees: bool,
        compute_pagerank: bool,
        compute_hits: bool,
        log_file: str,
        statistics_only: bool,
        vertex_in_degree: str,
        vertex_out_degree: str,
        vertex_pagerank: str,
        vertex_auth: str,
        vertex_hubs: str,
        top_n: int,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Load a KGTK edge file as a graph and report statistics about it.

    A summary (top relations and, on request, degree statistics, PageRank and
    HITS results) is written to log_file.  Unless statistics_only is set, the
    edges are also written to the output file, followed by per-vertex edges
    for the in-degree, the out-degree and every other vertex property, using
    the vertex_* arguments as labels.

    Raises:
        KGTKException: wrapping any exception raised during processing,
            including the one raised when a required column is missing.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from graph_tool import centrality
    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Maps vertex property names in the graph to the labels written out for them.
    property_labels = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth,
    }

    try:
        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

        # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
        degree_directions = ['in', 'out', 'total']
        name_prop = 'name'
        edge_columns = ["node1", "label", "node2", "id"]

        if verbose:
            print('loading the KGTK input file...\n', file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                         error_file=error_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         verbose=verbose,
                                         very_verbose=very_verbose,
                                         )

        # Report every missing required column before giving up.
        sub: int = kr.get_node1_column_index()
        if sub < 0:
            print("Missing node1 (subject) column.", file=error_file, flush=True)
        pred: int = kr.get_label_column_index()
        if pred < 0:
            print("Missing label (predicate) column.", file=error_file, flush=True)
        obj: int = kr.get_node2_column_index()
        if obj < 0:
            print("Missing node2 (object) column", file=error_file, flush=True)
        if min(sub, pred, obj) < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        predicate: str = kr.column_names[pred]

        G2 = load_graph_from_kgtk(kr, directed=not undirected, ecols=(sub, obj), verbose=verbose, out=error_file)
        if verbose:
            print('graph loaded! It has %d nodes and %d edges.' % (G2.num_vertices(), G2.num_edges()),
                  file=error_file, flush=True)

        kw: KgtkWriter = KgtkWriter.open(edge_columns,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        with open(log_file, 'w') as log:
            log.write('graph loaded! It has %d nodes and %d edges\n' % (G2.num_vertices(), G2.num_edges()))

            log.write('\n###Top relations:\n')
            for relation, frequency in gtanalysis.get_topN_relations(G2, pred_property=predicate):
                log.write('%s\t%d\n' % (relation, frequency))

            if compute_degrees:
                log.write('\n###Degrees:\n')
                for direction in degree_directions:
                    histogram = gtanalysis.compute_node_degree_hist(G2, direction)
                    max_degree = len(histogram) - 1
                    mean_degree, std_degree = gtanalysis.compute_avg_node_degree(G2, direction)
                    log.write('%s degree stats: mean=%f, std=%f, max=%d\n'
                              % (direction, mean_degree, std_degree, max_degree))

            if compute_pagerank:
                log.write('\n###PageRank\n')
                pagerank_prop = G2.new_vertex_property('float')
                centrality.pagerank(G2, prop=pagerank_prop)
                G2.properties[('v', 'vertex_pagerank')] = pagerank_prop
                log.write('Max pageranks\n')
                for n_id, n_label, score in gtanalysis.get_topn_indices(G2, 'vertex_pagerank', top_n, name_prop):
                    log.write('%s\t%s\t%f\n' % (n_id, n_label, score))

            if compute_hits:
                log.write('\n###HITS\n')
                hits_eig, G2.vp['vertex_hubs'], G2.vp['vertex_auth'] = gtanalysis.compute_hits(G2)
                log.write('HITS hubs\n')
                for n_id, n_label, score in gtanalysis.get_topn_indices(G2, 'vertex_hubs', top_n, name_prop):
                    log.write('%s\t%s\t%f\n' % (n_id, n_label, score))
                log.write('HITS auth\n')
                for n_id, n_label, score in gtanalysis.get_topn_indices(G2, 'vertex_auth', top_n, name_prop):
                    log.write('%s\t%s\t%f\n' % (n_id, n_label, score))

            if not statistics_only:
                # First the edges themselves, with a running counter in the id.
                serial = 0
                for edge in G2.edges():
                    source, target = edge
                    edge_label = G2.ep[predicate][edge]
                    source_name = G2.vp[name_prop][source]
                    kw.write([source_name,
                              edge_label,
                              G2.vp[name_prop][target],
                              '{}-{}-{}'.format(source_name, edge_label, serial)])
                    serial += 1

                # Then the per-vertex values, with the counter restarted.
                serial = 0
                for vertex in G2.vertices():
                    vertex_name = G2.vp[name_prop][vertex]

                    kw.write([vertex_name,
                              vertex_in_degree,
                              str(vertex.in_degree()),
                              '{}-{}-{}'.format(vertex_name, vertex_in_degree, serial)])
                    serial += 1

                    kw.write([vertex_name,
                              vertex_out_degree,
                              str(vertex.out_degree()),
                              '{}-{}-{}'.format(vertex_name, vertex_out_degree, serial)])
                    serial += 1

                    for prop_name in G2.vertex_properties.keys():
                        if prop_name == name_prop:
                            continue
                        prop_label = property_labels[prop_name]
                        kw.write([vertex_name,
                                  prop_label,
                                  str(G2.vp[prop_name][vertex]),
                                  '{}-{}-{}'.format(vertex_name, prop_label, serial)])
                        serial += 1

        kw.close()
        kr.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e))
def process(self):
    """Count the unique values in one column and write the counts as KGTK.

    Rows are optionally filtered first: when self.where_column_name is set,
    only rows whose value in that column is in self.where_values are counted.
    An empty value is replaced by self.empty_value; values that are still
    empty are not counted.  Counted values are prefixed with self.prefix.

    In "edge" output format one (value, self.label_value, count) row is
    written per value.  In "node" output format a single row is written,
    with one column per value.

    Raises:
        ValueError: for an unknown column, a where column without values to
            test, an unknown output format, or a value that would collide
            with a node1 column name in node output.
    """
    # Open the input file.
    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin", file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                     error_file=self.error_file,
                                     options=self.reader_options,
                                     value_options=self.value_options,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose,
                                     )

    if self.column_name not in kr.column_name_map:
        raise ValueError("Column %s is not in the input file" % (self.column_name))
    column_idx: int = kr.column_name_map[self.column_name]

    where_column_idx: int = -1
    where_value_set: typing.Set[str] = set()  # set(), not {}: {} would be a dict.
    if self.where_column_name is not None:
        if self.where_column_name not in kr.column_name_map:
            raise ValueError("Where column '%s' is not in the input file." % (self.where_column_name))
        where_column_idx = kr.column_name_map[self.where_column_name]
        if self.where_values is None or len(self.where_values) == 0:
            raise ValueError("Where column '%s' but no values to test." % (self.where_column_name))
        else:
            where_value_set = set(self.where_values)

    if self.verbose:
        print("Counting unique values from the %s column in %s" % (self.column_name, self.input_file_path),
              file=self.error_file, flush=True)
    input_line_count: int = 0
    skip_line_count: int = 0
    empty_value_count: int = 0
    value_counts: typing.MutableMapping[str, int] = {}

    row: typing.List[str]
    for row in kr:
        input_line_count += 1
        if where_column_idx >= 0:
            if row[where_column_idx] not in where_value_set:
                skip_line_count += 1
                continue

        value: str = row[column_idx]
        if len(value) == 0:
            value = self.empty_value
        if len(value) > 0:
            value = self.prefix + value
            value_counts[value] = value_counts.get(value, 0) + 1
        else:
            # Still empty after substituting self.empty_value: not counted.
            empty_value_count += 1

    if self.verbose:
        print("Read %d records, skipped %d, found %d unique non-empty values, %d empty values."
              % (input_line_count, skip_line_count, len(value_counts), empty_value_count),
              file=self.error_file, flush=True)

    # The values are written in sorted order in both output formats.
    sorted_values: typing.List[str] = sorted(value_counts.keys())

    # In node mode we can't open the output file until we are done reading
    # the input file, because we need the list of unique values to
    # build the column list.
    output_columns: typing.List[str]
    if self.output_format == "edge":
        output_columns = ["node1", "label", "node2"]
    elif self.output_format == "node":
        output_columns = ["id"]
        for value in sorted_values:
            # TODO: provide a way to override this check.
            if value in KgtkFormat.NODE1_COLUMN_NAMES:
                raise ValueError("Cannot write a KGTK node file with a column named '%s'." % value)
            output_columns.append(value)
    else:
        raise ValueError("Unknown output format %s" % str(self.output_format))

    if self.verbose:
        print("Opening the output file: %s" % self.output_file_path, file=self.error_file, flush=True)
    ew: KgtkWriter = KgtkWriter.open(output_columns,
                                     self.output_file_path,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    if self.output_format == "edge":
        for value in sorted_values:
            ew.write([value, self.label_value, str(value_counts[value])])
    elif self.output_format == "node":
        # One row: the counted column's name as the id, then one count per value.
        output_row: typing.List[str] = [self.column_name]
        for value in sorted_values:
            output_row.append(str(value_counts[value]))
        ew.write(output_row)
    else:
        raise ValueError("Unknown output format %s" % str(self.output_format))

    ew.close()
def process(self):
    """Read the input file and pass its rows, grouped by key, to process_row().

    The key columns come from self.key_column_names or, when that is empty,
    from the required columns of the edge or node file.  Presorted input is
    streamed (optionally verifying the sort order); otherwise all rows are
    loaded into memory and processed in sorted key order.  A final flushing
    call to process_row() is made after the last row.

    Raises:
        ValueError: for missing ID builder options, unknown or conflicting
            column names, an input file that is neither edge nor node when no
            key columns were supplied, or a sort order violation.
    """
    # Open the input file.
    if self.verbose:
        if self.input_file_path is None:
            print("Reading the input data from stdin", file=self.error_file, flush=True)
        else:
            print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                     error_file=self.error_file,
                                     options=self.reader_options,
                                     value_options=self.value_options,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose,
                                     )
    self.id_column_idx = kr.id_column_idx

    # If requested, create the ID column builder.
    # Assemble the list of output column names.
    idb: typing.Optional[KgtkIdBuilder] = None
    output_column_names: typing.List[str]
    if not self.build_id:
        output_column_names = kr.column_names
    else:
        if self.idbuilder_options is None:
            raise ValueError("ID build requested but ID builder options are missing")
        idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
        output_column_names = idb.column_names

    # Build the list of key column indices:
    key_idx_list: typing.List[int] = []
    if len(self.key_column_names) > 0:
        # Append columns to the list of key column indices,
        # silently removing duplicates, but complaining about unknown names.
        #
        # TODO: warn about duplicates?
        key_name: str
        for key_name in self.key_column_names:
            if key_name not in kr.column_name_map:
                raise ValueError("Column %s is not in the input file" % (repr(key_name)))
            named_idx: int = kr.column_name_map[key_name]
            if named_idx not in key_idx_list:
                key_idx_list.append(named_idx)
    elif kr.is_edge_file:
        # Add the KGTK edge file required columns.
        key_idx_list.extend((kr.node1_column_idx, kr.label_column_idx, kr.node2_column_idx))
        if not self.compact_id and kr.id_column_idx >= 0:
            key_idx_list.append(kr.id_column_idx)
    elif kr.is_node_file:
        # Add the KGTK node file required column:
        key_idx_list.append(kr.id_column_idx)
    else:
        raise ValueError("The input file is neither an edge nor a node file.  Key columns must be supplied.")

    if self.verbose:
        print("key indexes: %s" % " ".join([str(idx) for idx in key_idx_list]), file=self.error_file, flush=True)

    self.keep_first_idx_list.clear()
    if len(self.keep_first_names) > 0:
        first_name: str
        for first_name in self.keep_first_names:
            if first_name not in kr.column_name_map:
                raise ValueError("Keep first column %s is not in the input file" % (repr(first_name)))
            first_idx: int = kr.column_name_map[first_name]
            if first_idx in key_idx_list:
                raise ValueError("Keep first column %s may not be a key column" % (repr(first_name)))
            self.keep_first_idx_list.append(first_idx)
        if self.verbose:
            print("keep first indexes: %s" % " ".join([str(idx) for idx in self.keep_first_idx_list]),
                  file=self.error_file, flush=True)

    if self.deduplicate:
        if self.compact_id and kr.id_column_idx >= 0 and kr.id_column_idx not in self.keep_first_idx_list:
            self.keep_first_idx_list.append(kr.id_column_idx)

        # Any columns that aren't in the keep_first list and aren't
        # already in key_idx_list will be appended to key_idx_list:
        column_idx: int
        for column_idx in range(kr.column_count):
            if column_idx in self.keep_first_idx_list or column_idx in key_idx_list:
                continue
            key_idx_list.append(column_idx)

        if self.verbose:
            print("revised key indexes: %s" % " ".join([str(idx) for idx in key_idx_list]),
                  file=self.error_file, flush=True)

    if self.verbose:
        print("key indexes: %s" % " ".join(str(idx) for idx in key_idx_list), file=self.error_file, flush=True)

    # Open the output file.
    ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                     self.output_file_path,
                                     mode=kr.mode,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     use_mgzip=self.reader_options.use_mgzip,  # Hack!
                                     mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    # Open the optional list output file.
    lew: typing.Optional[KgtkWriter] = None
    if self.list_output_file_path is not None:
        lew = KgtkWriter.open(output_column_names,
                              self.list_output_file_path,
                              mode=kr.mode,
                              require_all_columns=False,
                              prohibit_extra_columns=True,
                              fill_missing_columns=True,
                              use_mgzip=self.reader_options.use_mgzip,  # Hack!
                              mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
                              gzip_in_parallel=False,
                              verbose=self.verbose,
                              very_verbose=self.very_verbose)

    input_line_count: int = 0
    row: typing.List[str] = []
    input_key: str
    last_key: typing.Optional[str] = None
    ascending: typing.Optional[bool] = None  # Unknown until two different keys are seen.

    if self.sorted_input:
        if self.verbose:
            print("Reading the input data from %s" % self.input_file_path, file=self.error_file, flush=True)
        for row in kr:
            input_line_count += 1
            input_key = self.build_key(row, key_idx_list)
            if self.verify_sort:
                if last_key is None:
                    last_key = input_key
                elif last_key != input_key:
                    # The first change of key fixes the direction; every later
                    # change must go the same way.
                    rising: bool = last_key < input_key
                    if ascending is None:
                        ascending = rising
                    elif ascending != rising:
                        shown_prev: str = last_key.replace(self.field_separator, KgtkFormat.LIST_SEPARATOR)
                        shown_curr: str = input_key.replace(self.field_separator, KgtkFormat.LIST_SEPARATOR)
                        if ascending:
                            raise ValueError("Line %d sort violation going up: prev='%s' curr='%s'"
                                             % (input_line_count, shown_prev, shown_curr))
                        raise ValueError("Line %d sort violation going down: prev='%s' curr='%s'"
                                         % (input_line_count, shown_prev, shown_curr))
                    last_key = input_key

            self.process_row(input_key, row, input_line_count, idb, ew, lew)

    else:
        if self.verbose:
            print("Sorting the input data from %s" % self.input_file_path, file=self.error_file, flush=True)
        # Map key values to lists of input and output data.
        rows_by_key: typing.MutableMapping[str, typing.List[typing.List[str]]] = {}

        for row in kr:
            input_line_count += 1
            input_key = self.build_key(row, key_idx_list)
            # Start a new list for an unseen key, then append the row.
            rows_by_key.setdefault(input_key, []).append(row)

        if self.verbose:
            print("Processing the sorted input data", file=self.error_file, flush=True)

        for input_key in sorted(rows_by_key.keys()):
            for row in rows_by_key[input_key]:
                self.process_row(input_key, row, input_line_count, idb, ew, lew)

    # Flush the final row, if any.  We pass the last row read for
    # feedback, such as an ID uniqueness violation.
    self.process_row("", row, input_line_count, idb, ew, lew, flush=True)

    if self.verbose:
        print("Read %d records, excluded %d records, wrote %d records."
              % (input_line_count, self.excluded_row_count, self.output_line_count),
              file=self.error_file, flush=True)
        if lew is not None:
            print("Wrote %d list ouput records." % (self.list_output_line_count), file=self.error_file, flush=True)

    ew.close()
    if lew is not None:
        lew.close()
def get_initial_namespaces(self) -> int:
    """Load the initial namespace prefixes from the namespace file.

    When no namespace file was given, the result of
    self.get_default_namespaces() is returned instead.  Otherwise every row
    whose label equals self.prefix_expansion_label is recorded in
    self.namespace_prefixes (prefix -> id) and self.namespace_ids
    (id -> prefix); the first entry wins when there are duplicates.

    Returns:
        The number of rows read from the namespace file.
    """
    # Read the namespaces.  If no file, use a limited internal
    # default.
    if self.namespace_file_path is None:
        return self.get_default_namespaces()

    if self.verbose:
        # Report the file that is actually being read.
        print("Processing namespace file %s" % str(self.namespace_file_path), file=self.error_file, flush=True)
    kr: KgtkReader = KgtkReader.open(self.namespace_file_path,
                                     mode=KgtkReaderMode.EDGE,
                                     options=self.reader_options,
                                     error_file=self.error_file,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose,
                                     )

    namespace_line_count: int = 0
    namespace_row: typing.List[str]
    for namespace_row in kr:
        namespace_line_count += 1
        if namespace_row[kr.label_column_idx] == self.prefix_expansion_label:
            namespace_id: str = namespace_row[kr.node1_column_idx]
            namespace_prefix: str = namespace_row[kr.node2_column_idx]
            if not (namespace_prefix.startswith('"') and namespace_prefix.endswith('"')):
                if self.verbose:
                    print("The namespace prefix must be a KGKT string: '%s'" % namespace_prefix,
                          file=self.error_file, flush=True)
                continue

            # Strip the delimiting double quotes from the KGTK string.
            # Per RFC 3986, internal double quotes are not allowed in
            # a URL unless percent-encoded, so we needn't bother looking
            # for them.
            namespace_prefix = namespace_prefix[1:-1]

            if namespace_prefix in self.namespace_prefixes:
                if self.verbose:
                    print("Duplicate initial namespace prefix '%s'" % namespace_prefix,
                          file=self.error_file, flush=True)
            else:
                self.namespace_prefixes[namespace_prefix] = namespace_id

            if namespace_id in self.namespace_ids:
                if self.verbose:
                    print("Duplicate initial namespace id '%s'" % namespace_id,
                          file=self.error_file, flush=True)
            else:
                self.namespace_ids[namespace_id] = namespace_prefix
        else:
            if self.verbose:
                print("Ignoring initial namespace label '%s'" % namespace_row[kr.label_column_idx],
                      file=self.error_file, flush=True)

    return namespace_line_count
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]] = None,
        labels: typing.Optional[typing.List[str]] = None,
        id_column_name: typing.Optional[str] = None,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Turn the non-ID columns of each input row into (node1, label, node2) edges.

    For every input row, node1 is the value of the ID column.  Every other
    selected column with a non-empty value yields one edge per list item in
    that value; the edge label is the column name, or the replacement label
    supplied for that column.

    Args:
        input_file: the input file argument (resolved with KGTKArgumentParser).
        output_file: the output file argument (resolved with KGTKArgumentParser).
        columns: when given, only these columns are converted.
        labels: optional replacement labels, parallel to `columns`.
        id_column_name: the name of the ID column, passed to the reader's
            get_id_column_index().
        errors_to_stdout: send messages to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; not read here.
        show_options: print the effective options before processing.
        verbose: print progress messages.
        very_verbose: passed through to the reader and writer.
        **kwargs: reader and value options.

    Returns:
        0 on success, 1 when an exception was reported through
        kgtk_exception_auto_handler.

    Raises:
        KGTKException: when `labels` is given without `columns`, or when
            their lengths differ.
    """
    # import modules locally
    import os
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        if labels is not None:
            print("--labels=%s" % " ".join(labels), file=error_file)
        if id_column_name is not None:
            print("--id-column=%s" % id_column_name, file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Starting normalize_nodes pid=%d" % (os.getpid()), file=error_file, flush=True)

    label_map: typing.MutableMapping[str, str] = dict()
    if labels is not None and len(labels) > 0:
        if columns is None:
            raise KGTKException("--columns must be supplied when --labels is used.")
        if len(columns) != len(labels):
            raise KGTKException("%d columns were supplied, but %d labels." % (len(columns), len(labels)))
        # The lengths are equal, so zip() pairs every column with its label.
        label_map.update(zip(columns, labels))

    # A set gives constant-time membership tests for the column filter.
    wanted_columns: typing.Optional[typing.Set[str]] = None if columns is None else set(columns)

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file), file=error_file, flush=True)

        kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         error_file=error_file,
                                         verbose=verbose,
                                         very_verbose=very_verbose,
                                         )

        input_line_count: int = 0
        output_line_count: int = 0
        try:
            id_column_idx: int = kr.get_id_column_index(id_column_name)
            if id_column_idx < 0:
                raise KGTKException("Unknown ID column %s" % repr(id_column_name))

            # The columns to convert and their labels do not depend on the
            # row, so work them out once instead of once per row.
            edge_columns: typing.List[typing.Tuple[int, str]] = [
                (column_idx, label_map.get(column_name, column_name))
                for column_idx, column_name in enumerate(kr.column_names)
                if column_idx != id_column_idx
                and (wanted_columns is None or column_name in wanted_columns)
            ]

            output_column_names: typing.List[str] = [KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2]

            if verbose:
                print("Opening the output file: %s" % str(output_kgtk_file), file=error_file, flush=True)
            kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            try:
                row: typing.List[str]
                for row in kr:
                    input_line_count += 1

                    node1_value: str = row[id_column_idx]

                    column_idx: int
                    label_value: str
                    for column_idx, label_value in edge_columns:
                        new_value: str = row[column_idx]
                        if len(new_value) == 0:
                            continue  # ignore empty values.

                        # The column value might contain a KGTK list.  Since node2 isn't supposed
                        # to contain lists, we'll split it.
                        node2_value: str
                        for node2_value in KgtkValue.split_list(new_value):
                            if len(node2_value) == 0:
                                continue  # node2 shouldn't contain empty values

                            kw.write([node1_value, label_value, node2_value])
                            output_line_count += 1
            finally:
                # Close the output file even when a row fails.
                kw.close()
        finally:
            # The reader was previously never closed.
            kr.close()

        if verbose:
            print("Read %d node rows, wrote %d edge rows." % (input_line_count, output_line_count),
                  file=error_file, flush=True)

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy the input file to the output file through a KgtkIdBuilder.

    The reader is opened first because the ID builder is created from it;
    the ID builder then supplies the column names for the writer, and
    finally the ID builder processes the data stream.

    Returns:
        0 on success.

    Raises:
        KGTKException: wraps any exception (or SystemExit) raised while
            opening the files or processing the data.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing
    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Messages go to stderr unless stdout was requested.
    message_stream: typing.TextIO = sys.stderr
    if errors_to_stdout:
        message_stream = sys.stdout

    # Option structures, each built from the same keyword arguments.
    id_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    input_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    cell_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Echo the effective options for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(source_path), file=message_stream)
        print("--output-file=%s" % str(target_path), file=message_stream)
        for option_group in (id_options, input_options, cell_options):
            option_group.show(out=message_stream)
        print("=======", file=message_stream, flush=True)

    try:
        # Reader first: the ID builder needs it.
        reader: KgtkReader = KgtkReader.open(
            source_path,
            error_file=message_stream,
            options=input_options,
            value_options=cell_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # The ID builder may revise the list of column names.
        id_builder: KgtkIdBuilder = KgtkIdBuilder.new(reader, id_options)

        # The writer uses the ID builder's column names and the reader's mode.
        writer: KgtkWriter = KgtkWriter.open(
            id_builder.column_names,
            target_path,
            mode=KgtkWriter.Mode[reader.mode.name],
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Stream the input through the ID builder into the writer.
        id_builder.process(reader, writer)

        # Clean up.
        writer.close()
        reader.close()

        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
def run(
        input_file: KGTKFiles,
        pattern_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        grouped_input: bool = False,
        reject_node1_groups: bool = False,
        no_complaints: bool = False,
        complain_immediately: bool = False,
        add_isa_column: bool = False,
        isa_column_name: str = "isa;node2",
        autovalidate: bool = True,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Validate the input file against property patterns.

    The patterns are loaded from the pattern file, then a
    PropertyPatternValidator processes the input file, writing to the
    optional output file and the optional reject file.

    Args:
        input_file: the data file argument.
        pattern_file: the pattern file argument (never defaults to stdin).
        output_file: optional output file argument.
        reject_file: optional reject file argument.
        grouped_input, reject_node1_groups, no_complaints,
        complain_immediately, autovalidate: passed to the validator.
        add_isa_column: when an output file is written, ensure that the
            column named by `isa_column_name` exists in the output columns
            and pass its index to the validator.
        isa_column_name: the name of that column.
        errors_to_stdout: send messages to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; not read here.
        show_options: print the effective options before processing.
        verbose, very_verbose: feedback controls.
        **kwargs: reader and value options.

    Returns:
        0 on success.

    Raises:
        KGTKException: wraps any exception raised during processing.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderMode, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.propertypatternvalidator import PropertyPatterns, PropertyPatternValidator
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    pattern_kgtk_file: Path = KGTKArgumentParser.get_input_file(pattern_file, default_stdin=False)
    output_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(output_file)
    reject_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(reject_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    # Every line goes to error_file so that the option dump cannot end up
    # mixed into data written to stdout.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--pattern-file=%s" % str(pattern_kgtk_file), file=error_file)
        if output_kgtk_file is not None:
            print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--presorted=%s" % str(grouped_input), file=error_file)
        print("--reject-node1-groups=%s" % str(reject_node1_groups), file=error_file)
        print("--complain-immediately=%s" % str(complain_immediately), file=error_file)
        print("--add-isa-column=%s" % str(add_isa_column), file=error_file)
        print("--isa-column-name=%s" % str(isa_column_name), file=error_file)
        print("--autovalidate=%s" % str(autovalidate), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Reading data from '%s'" % str(input_kgtk_file), file=error_file, flush=True)
        print("Reading patterns from '%s'" % str(pattern_kgtk_file), file=error_file, flush=True)
        if output_kgtk_file is not None:
            print("Writing good data to '%s'" % str(output_kgtk_file), file=error_file, flush=True)
        if reject_kgtk_file is not None:
            print("Writing rejected data to '%s'" % str(reject_kgtk_file), file=error_file, flush=True)

    try:
        pkr: KgtkReader = KgtkReader.open(pattern_kgtk_file,
                                          error_file=error_file,
                                          mode=KgtkReaderMode.EDGE,
                                          options=reader_options,
                                          value_options=value_options,
                                          verbose=verbose,
                                          very_verbose=very_verbose)

        pps: PropertyPatterns = PropertyPatterns.load(pkr,
                                                      value_options,
                                                      error_file=error_file,
                                                      verbose=verbose,
                                                      very_verbose=very_verbose)

        kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                         error_file=error_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        output_column_names: typing.List[str] = []
        isa_column_idx: int = -1
        if output_kgtk_file is not None:
            output_column_names = kr.column_names.copy()
            if add_isa_column:
                # Reuse an existing column of that name, else append one.
                if isa_column_name in output_column_names:
                    isa_column_idx = output_column_names.index(isa_column_name)
                else:
                    isa_column_idx = len(output_column_names)
                    output_column_names.append(isa_column_name)

        ppv: PropertyPatternValidator = PropertyPatternValidator.new(pps,
                                                                     kr,
                                                                     grouped_input=grouped_input,
                                                                     reject_node1_groups=reject_node1_groups,
                                                                     no_complaints=no_complaints,
                                                                     complain_immediately=complain_immediately,
                                                                     isa_column_idx=isa_column_idx,
                                                                     autovalidate=autovalidate,
                                                                     value_options=value_options,
                                                                     error_file=error_file,
                                                                     verbose=verbose,
                                                                     very_verbose=very_verbose)

        kw: typing.Optional[KgtkWriter] = None
        if output_kgtk_file is not None:
            kw = KgtkWriter.open(output_column_names,
                                 output_kgtk_file,
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        rkw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            rkw = KgtkWriter.open(output_column_names,
                                  reject_kgtk_file,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        ppv.process(kr, kw, rkw)

        if verbose:
            print("Read %d rows, %d valid" % (ppv.input_row_count, ppv.valid_row_count),
                  file=error_file, flush=True)
            if kw is not None:
                print("Wrote %d good rows" % ppv.output_row_count, file=error_file, flush=True)
            if rkw is not None:
                print("Wrote %d rejected rows" % ppv.reject_row_count, file=error_file, flush=True)

        if kw is not None:
            kw.close()

        if rkw is not None:
            rkw.close()

        return 0

    except Exception as e:
        raise KGTKException(e)
def run(input_file: KGTKFiles,
        output_path: str,
        file_prefix: str,
        split_by_qnode: bool,
        lines: int,
        gzipped_output: bool,
        errors_to_stdout: bool = False,
        **kwargs
        ) -> int:
    """Split an edge file into several files without breaking up a node1 group.

    Rows are collected while node1 stays the same according to
    are_nodes_equal().  When node1 changes, the collected rows are written
    to a new file if `split_by_qnode` is set or at least `lines` rows have
    been collected.  Only rows whose node1 value starts with 'Q' or 'P' are
    collected; other rows are skipped.

    Args:
        input_file: the input file argument.
        output_path: the directory that receives the output files.
        file_prefix: the output file name prefix when not splitting by Qnode.
        split_by_qnode: write one file per node1 group, named after the
            first node1 value of the group.
        lines: the minimum number of rows per file when not splitting by Qnode.
        gzipped_output: use the ".tsv.gz" suffix instead of ".tsv".
        errors_to_stdout: send messages to stdout instead of stderr.
        **kwargs: reader and value options.

    Returns:
        0 on success.

    Raises:
        KGTKException: when the input lacks a node1, label or node2 column.
    """
    import sys
    from pathlib import Path
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    def write_files(error_file, file_number, file_prefix, kr, lines_to_write, output_path, Qnode, reader_options,
                    split_by_qnode, suffix):
        # Write one batch of rows to its own output file.
        if split_by_qnode:
            output_kgtk_file = Path(f'{output_path}/{Qnode}{suffix}')
        else:
            output_kgtk_file = Path(f'{output_path}/{file_prefix}{file_number}{suffix}')

        kw = KgtkWriter.open(kr.column_names,
                             output_kgtk_file,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             use_mgzip=reader_options.use_mgzip,  # Hack!
                             mgzip_threads=reader_options.mgzip_threads,  # Hack!
                             error_file=error_file,
                             verbose=False,
                             very_verbose=False)
        try:
            for r in lines_to_write:
                kw.write(r)
        finally:
            kw.close()

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)

    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    suffix = ".tsv.gz" if gzipped_output else ".tsv"

    kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                     options=reader_options,
                                     value_options=value_options,
                                     error_file=error_file,
                                     verbose=False,
                                     very_verbose=False,
                                     )

    # The reader used to be closed on the error path only; the finally
    # clause closes it on every path.
    try:
        node1_idx: int = kr.get_node1_column_index()
        label_idx: int = kr.get_label_column_index()
        node2_idx: int = kr.get_node2_column_index()

        if node1_idx < 0 or label_idx < 0 or node2_idx < 0:
            print("Error: Not a valid file: {}. A valid edge file should have these columns: node1, label and node2"
                  .format(input_file),
                  file=error_file, flush=True)
            raise KGTKException("Missing columns.")

        prev = None
        lines_to_write = list()
        file_number = 0
        for row in kr:
            node = row[node1_idx]
            if node.startswith(('Q', 'P')):
                if prev is None:
                    prev = node

                if not are_nodes_equal(prev, node):
                    # Only start a new file on a node boundary.
                    if split_by_qnode or len(lines_to_write) >= lines:
                        write_files(error_file, file_number, file_prefix, kr, lines_to_write, output_path, prev,
                                    reader_options, split_by_qnode, suffix)
                        lines_to_write = list()
                        file_number += 1

                    prev = node

                lines_to_write.append(row)

        # Write whatever is left over after the last node boundary.
        if len(lines_to_write) > 0:
            write_files(error_file, file_number, file_prefix, kr, lines_to_write, output_path, prev,
                        reader_options, split_by_qnode, suffix)
    finally:
        kr.close()

    return 0
def python_sort():
    """Sort the input file in memory and write the rows to the output file.

    The sort key of a row is built by joining the values of the key columns
    with KgtkFormat.KEY_FIELD_SEPARATOR.  The key columns come from
    `columns` (names or 1-based column numbers, optionally comma-separated);
    when no columns are given they default to the ID column for node files,
    and to the ID column (when present) followed by node1, label and node2
    for edge files.

    Rows that share a sort key are all kept and are written in the order
    in which they were read.

    Raises:
        KGTKException: for an invalid column number, an unknown column
            name, or an unknown file mode when no columns were given.
    """
    if verbose:
        print("Opening the input file: %s" % str(input_path), file=error_file, flush=True)
    kr: KgtkReader = KgtkReader.open(input_path,
                                     options=reader_options,
                                     value_options=value_options,
                                     error_file=error_file,
                                     verbose=verbose,
                                     very_verbose=very_verbose,
                                     )

    sort_idx: int
    key_idxs: typing.List[int] = []
    if columns is not None and len(columns) > 0:
        # Process the list of column names, including splitting
        # comma-separated lists of column names.
        column_name: str
        for column_name in columns:
            column_name_2: str
            for column_name_2 in column_name.split(","):
                column_name_2 = column_name_2.strip()
                if len(column_name_2) == 0:
                    continue
                if column_name_2.isdigit():
                    sort_idx = int(column_name_2)
                    # Column numbers are 1-based.  Reject 0 as well: it would
                    # turn into index -1 and silently sort on the last column.
                    if sort_idx < 1 or sort_idx > len(kr.column_names):
                        kr.close()
                        cleanup()
                        raise KGTKException("Invalid column number %d (max %d)." % (sort_idx, len(kr.column_names)))
                    key_idxs.append(sort_idx - 1)
                else:
                    if column_name_2 not in kr.column_names:
                        kr.close()
                        cleanup()
                        raise KGTKException("Unknown column_name %s" % column_name_2)
                    key_idxs.append(kr.column_name_map[column_name_2])
    else:
        if kr.is_node_file:
            key_idxs.append(kr.id_column_idx)

        elif kr.is_edge_file:
            if kr.id_column_idx >= 0:
                key_idxs.append(kr.id_column_idx)

            key_idxs.append(kr.node1_column_idx)
            key_idxs.append(kr.label_column_idx)
            key_idxs.append(kr.node2_column_idx)
        else:
            # Close the reader here too, as the other error paths do.
            kr.close()
            cleanup()
            raise KGTKException("Unknown KGTK file mode, please specify the sorting columns.")

    if verbose:
        print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]), file=error_file, flush=True)

    # Map each sort key to the list of rows that carry it.  Keeping a list
    # prevents rows with equal keys from overwriting each other.
    lines: typing.MutableMapping[str, typing.List[typing.List[str]]] = dict()
    line_count: int = 0

    progress_startup()
    key: str
    row: typing.List[str]
    for row in kr:
        line_count += 1
        key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx] for idx in key_idxs)
        if key in lines:
            lines[key].append(row)
        else:
            lines[key] = [row]
    if verbose:
        print("\nRead %d data lines." % line_count, file=error_file, flush=True)

    kw = KgtkWriter.open(kr.column_names,
                         output_path,
                         mode=KgtkWriter.Mode[kr.mode.name],
                         verbose=verbose,
                         very_verbose=very_verbose)

    for key in sorted(lines.keys()):
        for row in lines[key]:
            kw.write(row)

    kw.close()
    kr.close()
def process(self):
    """Open the input and filter files, then run the selected filtering strategy.

    The input key columns and the filter key columns are obtained from
    self.get_key_columns(); when their counts differ a message is printed
    and the method returns without processing.  The optional output and
    reject writers reuse the input file's column names and mode.  The
    strategy depends on self.cache_input and self.preserve_order.
    """
    version_stamp: str = "2020-08-24T21:47:20.256050+00:00#mr0wtMHlN/QaplDsGc/ylG3Hw5stsjziykzuGlSHBSion4xoW/Bec0sn55IQ7wFWBUClRS7g1tbAuaqEduhUVA=="
    if self.show_version or self.verbose:
        print("KgtkIfEfexists version: %s" % version_stamp, file=self.error_file, flush=True)

    # Each input file is opened exactly once.
    if self.verbose:
        if self.input_file_path is None:
            print("Reading the input data from stdin", file=self.error_file, flush=True)
        else:
            print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)

    input_kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        who="input",
        options=self.input_reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    if self.verbose:
        print("Opening the filter input file: %s" % self.filter_file_path, file=self.error_file, flush=True)

    filter_kr: KgtkReader = KgtkReader.open(
        self.filter_file_path,
        who="filter",
        error_file=self.error_file,
        options=self.filter_reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    input_key_columns: typing.List[int] = self.get_key_columns(self.input_keys, input_kr, filter_kr, "input")
    filter_key_columns: typing.List[int] = self.get_key_columns(self.filter_keys, filter_kr, input_kr, "filter")

    input_key_count = len(input_key_columns)
    filter_key_count = len(filter_key_columns)
    if input_key_count != filter_key_count:
        print("There are %d input key columns but %d filter key columns.  Exiting." % (input_key_count,
                                                                                       filter_key_count),
              file=self.error_file, flush=True)
        return

    def open_writer(path, announcement):
        # Both writers share the same settings; only the path and the
        # verbose announcement differ.
        if path is None:
            return None
        if self.verbose:
            print(announcement % path, file=self.error_file, flush=True)
        return KgtkWriter.open(
            input_kr.column_names,
            path,
            mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

    ew: typing.Optional[KgtkWriter] = open_writer(self.output_file_path, "Opening the output file: %s")
    rew: typing.Optional[KgtkWriter] = open_writer(self.reject_file_path, "Opening the reject file: %s")

    # Pick the processing strategy.
    if not self.cache_input:
        strategy = self.process_cacheing_filter
    elif self.preserve_order:
        strategy = self.process_cacheing_input_preserving_order
    else:
        strategy = self.process_cacheing_input

    strategy(input_kr=input_kr,
             filter_kr=filter_kr,
             input_key_columns=input_key_columns,
             filter_key_columns=filter_key_columns,
             ew=ew,
             rew=rew)

    for writer in (ew, rew):
        if writer is not None:
            writer.close()
def load_property_labels_file(
        input_files: typing.List[str],
        error_file: typing.TextIO,
        reader_options: KgtkReaderOptions,
        value_options: KgtkValueOptions,
        label_filter: typing.List[str],
        verbose: bool = False,
):
    """Build a node-id -> label mapping from one or more KGTK files.

    For each row, node1 is the node id and node2 is its label.  When
    `label_filter` is not empty, only rows whose label column value is in
    `label_filter` are used.  An English label (language "en" with no
    language suffix) always replaces the stored entry, storing the
    destringified text; any other label is stored, as the raw node2 value,
    only when the node has no entry yet.

    Raises:
        KGTKException: when a required column cannot be identified in a file.

    Returns:
        The mapping from node ids to labels.
    """
    labels_by_node: typing.MutableMapping[str, str] = {}
    filtering: bool = len(label_filter) > 0

    for file_name in input_files:
        reader: KgtkReader = KgtkReader.open(
            Path(file_name),
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
        )

        # Report every missing column before giving up on the file.
        missing_column: bool = False
        if reader.node1_column_idx < 0:
            print("Cannot determine which column is node1 in %s" % file_name, file=error_file, flush=True)
            missing_column = True
        if filtering and reader.label_column_idx < 0:
            print("Cannot determine which column is label in %s" % file_name, file=error_file, flush=True)
            missing_column = True
        if reader.node2_column_idx < 0:
            print("Cannot determine which column is node2 in %s" % file_name, file=error_file, flush=True)
            missing_column = True
        if missing_column:
            raise KGTKException("Cannot identify a required column in %s" % file_name)

        record: typing.List[str]
        for record in reader:
            if filtering and record[reader.label_column_idx] not in label_filter:
                continue

            subject: str = record[reader.node1_column_idx]
            raw_label: str = record[reader.node2_column_idx]

            # Values that do not open with a quote are taken verbatim, with
            # no language information.
            text: str = raw_label
            language: str = ""
            language_suffix: str = ""
            if raw_label.startswith(("'", '"')):
                text, language, language_suffix = KgtkFormat.destringify(raw_label)

            # The last-read English label wins; otherwise the first-read
            # label of any other kind is kept.
            if language == "en" and language_suffix == "":
                labels_by_node[subject] = text
            elif subject not in labels_by_node:
                labels_by_node[subject] = raw_label

        reader.close()

    return labels_by_node