Beispiel #1
0
    def process(self):
        """Read the KGTK input, key each row, and pass the rows to process_row().

        The method:

        1. Opens the input with KgtkReader and records its ID column index in
           self.id_column_idx.
        2. Creates a KgtkIdBuilder when self.build_id is set; the builder then
           supplies the output column names, otherwise the input column names
           are reused.
        3. Builds the list of key column indexes, either from
           self.key_column_names or, when that is empty, from the required
           columns of an edge or node file.
        4. Resolves self.keep_first_names into self.keep_first_idx_list and,
           when self.deduplicate is set, extends the key with every column
           that is in neither list.
        5. Opens the output writer and the optional list output writer.
        6. Passes every row, with the key built by self.build_key(), to
           self.process_row(): in input order when self.sorted_input is set
           (optionally verifying the order), otherwise in sorted key order
           after loading all rows into memory.
        7. Makes a final process_row() call with flush=True.

        The reader and the writers are closed when the method exits, whether
        it returns normally or raises.

        Raises:
            ValueError: if ID building is requested without ID builder
                options, if no key columns are given and the input is neither
                an edge nor a node file, if a key or keep-first column name is
                not in the input file, if a keep-first column is also a key
                column, or if self.verify_sort is set and the input changes
                sort direction.
        """
        # Local import: keeps this method self-contained.
        from contextlib import ExitStack

        # Report where the input comes from.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        # Every resource registered on the stack is closed when the block
        # exits, including when an exception propagates out of it.
        with ExitStack() as cleanup:
            cleanup.callback(kr.close)
            self.id_column_idx = kr.id_column_idx

            # If requested, create the ID column builder.
            # Assemble the list of output column names.
            output_column_names: typing.List[str]
            idb: typing.Optional[KgtkIdBuilder] = None
            if self.build_id:
                if self.idbuilder_options is None:
                    raise ValueError(
                        "ID build requested but ID builder options are missing")
                idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
                output_column_names = idb.column_names
            else:
                output_column_names = kr.column_names

            # Build the list of key column indexes.
            key_idx_list: typing.List[int] = []

            if not self.key_column_names:
                if kr.is_edge_file:
                    # Add the KGTK edge file required columns.
                    key_idx_list.append(kr.node1_column_idx)
                    key_idx_list.append(kr.label_column_idx)
                    key_idx_list.append(kr.node2_column_idx)
                    # The ID column is part of the key only when IDs are not
                    # being compacted and the input actually has one.
                    if not self.compact_id and kr.id_column_idx >= 0:
                        key_idx_list.append(kr.id_column_idx)

                elif kr.is_node_file:
                    # Add the KGTK node file required column.
                    key_idx_list.append(kr.id_column_idx)

                else:
                    raise ValueError(
                        "The input file is neither an edge nor a node file.  Key columns must be supplied."
                    )

            else:
                # Append columns to the list of key column indexes,
                # silently removing duplicates, but complaining about
                # unknown names.
                #
                # TODO: warn about duplicates?
                column_name: str
                for column_name in self.key_column_names:
                    if column_name not in kr.column_name_map:
                        raise ValueError("Column %s is not in the input file" %
                                         (repr(column_name)))
                    key_idx: int = kr.column_name_map[column_name]
                    if key_idx not in key_idx_list:
                        key_idx_list.append(key_idx)

            if self.verbose:
                print("key indexes: %s" %
                      " ".join([str(idx) for idx in key_idx_list]),
                      file=self.error_file,
                      flush=True)

            # Resolve the keep-first column names to indexes.  A keep-first
            # column may not also be a key column.
            self.keep_first_idx_list.clear()
            if self.keep_first_names:
                keep_first_name: str
                for keep_first_name in self.keep_first_names:
                    if keep_first_name not in kr.column_name_map:
                        raise ValueError(
                            "Keep first column %s is not in the input file" %
                            (repr(keep_first_name)))
                    keep_first_idx: int = kr.column_name_map[keep_first_name]
                    if keep_first_idx in key_idx_list:
                        raise ValueError(
                            "Keep first column %s may not be a key column" %
                            (repr(keep_first_name)))
                    self.keep_first_idx_list.append(keep_first_idx)
                if self.verbose:
                    print("keep first indexes: %s" %
                          " ".join([str(idx) for idx in self.keep_first_idx_list]),
                          file=self.error_file,
                          flush=True)

            if self.deduplicate:
                # When compacting IDs, the ID column (if present) is treated
                # as a keep-first column rather than as part of the key.
                if (self.compact_id
                        and kr.id_column_idx >= 0
                        and kr.id_column_idx not in self.keep_first_idx_list):
                    self.keep_first_idx_list.append(kr.id_column_idx)

                # Any columns that aren't in the keep_first list and aren't
                # already in key_idx_list will be appended to key_idx_list:
                idx: int
                for idx in range(kr.column_count):
                    if idx not in self.keep_first_idx_list and idx not in key_idx_list:
                        key_idx_list.append(idx)

                if self.verbose:
                    print("revised key indexes: %s" %
                          " ".join([str(idx) for idx in key_idx_list]),
                          file=self.error_file,
                          flush=True)

            # Report the final key index list.  This repeats the earlier
            # "key indexes" line when deduplication did not extend the key.
            if self.verbose:
                print("key indexes: %s" %
                      " ".join([str(idx) for idx in key_idx_list]),
                      file=self.error_file,
                      flush=True)

            # Both writers are opened with the same settings.
            writer_settings: typing.Dict[str, typing.Any] = dict(
                mode=kr.mode,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                use_mgzip=self.reader_options.use_mgzip,  # Hack!
                mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
                gzip_in_parallel=False,
                verbose=self.verbose,
                very_verbose=self.very_verbose,
            )

            # Open the output file.
            ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                             self.output_file_path,
                                             **writer_settings)
            cleanup.callback(ew.close)

            # Open the optional list output file.
            lew: typing.Optional[KgtkWriter] = None
            if self.list_output_file_path is not None:
                lew = KgtkWriter.open(output_column_names,
                                      self.list_output_file_path,
                                      **writer_settings)
                cleanup.callback(lew.close)

            input_line_count: int = 0
            # Stays empty when the input has no data rows; the final flush
            # call below then receives an empty row.
            row: typing.List[str] = []
            input_key: str
            prev_input_key: typing.Optional[str] = None
            # None until two different keys have been seen; then True for
            # ascending input and False for descending input.
            going_up: typing.Optional[bool] = None
            if self.sorted_input:
                if self.verbose:
                    print("Reading the input data from %s" % self.input_file_path,
                          file=self.error_file,
                          flush=True)
                for row in kr:
                    input_line_count += 1
                    input_key = self.build_key(row, key_idx_list)
                    if self.verify_sort:
                        if prev_input_key is not None and input_key != prev_input_key:
                            ascending: bool = prev_input_key < input_key
                            if going_up is None:
                                # The first change of key fixes the direction.
                                going_up = ascending
                            elif going_up != ascending:
                                # The field separator inside the keys is
                                # replaced for display in the message.
                                raise ValueError(
                                    "Line %d sort violation going %s: prev='%s' curr='%s'"
                                    % (input_line_count,
                                       "up" if going_up else "down",
                                       prev_input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR),
                                       input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR)))
                        prev_input_key = input_key

                    self.process_row(input_key, row, input_line_count, idb, ew,
                                     lew)

            else:
                if self.verbose:
                    print("Sorting the input data from %s" % self.input_file_path,
                          file=self.error_file,
                          flush=True)
                # Map key values to the lists of input rows that share them.
                input_map: typing.MutableMapping[
                    str, typing.List[typing.List[str]]] = {}

                for row in kr:
                    input_line_count += 1
                    input_key = self.build_key(row, key_idx_list)
                    input_map.setdefault(input_key, []).append(row)

                if self.verbose:
                    print("Processing the sorted input data",
                          file=self.error_file,
                          flush=True)

                # NOTE: input_line_count is the total number of rows read at
                # this point, not the line number of the row being processed.
                for input_key in sorted(input_map):
                    for row in input_map[input_key]:
                        self.process_row(input_key, row, input_line_count, idb,
                                         ew, lew)

            # Flush the final row, if any.  We pass the last row read for
            # feedback, such as an ID uniqueness violation.
            self.process_row("", row, input_line_count, idb, ew, lew, flush=True)

            if self.verbose:
                print("Read %d records, excluded %d records, wrote %d records." %
                      (input_line_count, self.excluded_row_count,
                       self.output_line_count),
                      file=self.error_file,
                      flush=True)
                if lew is not None:
                    print("Wrote %d list ouput records." %
                          (self.list_output_line_count),
                          file=self.error_file,
                          flush=True)
Beispiel #2
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file from input to output, building IDs with KgtkIdBuilder.

    Args:
        input_file: input file argument, resolved with
            KGTKArgumentParser.get_input_file().
        output_file: output file argument, resolved with
            KGTKArgumentParser.get_output_file().
        errors_to_stdout: send diagnostics to stdout instead of stderr.
        errors_to_stderr: not read by this function; diagnostics go to stderr
            unless errors_to_stdout is set.
        show_options: print the resolved option structures before running.
        verbose: passed through to the reader and the writer.
        very_verbose: passed through to the reader and the writer.
        **kwargs: source for the ID builder, reader, and value options.

    Returns:
        0 on success.

    Raises:
        KGTKException: wraps any Exception or SystemExit raised while the
            files are opened or processed; the original exception is kept as
            the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        # First create the KgtkReader.  It provides parameters used by the ID
        # column builder. Next, create the ID column builder, which provides a
        # possibly revised list of column names for the KgtkWriter.  Create
        # the KgtkWriter.  Last, process the data stream.

        # Open the input file.
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        try:
            # Create the ID builder.
            idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

            # Open the output file.
            ew: KgtkWriter = KgtkWriter.open(
                idb.column_names,
                output_kgtk_file,
                mode=KgtkWriter.Mode[kr.mode.name],
                require_all_columns=True,
                prohibit_extra_columns=True,
                fill_missing_columns=False,
                gzip_in_parallel=False,
                verbose=verbose,
                very_verbose=very_verbose)
            try:
                # Process the input file, building IDs.
                idb.process(kr, ew)
            finally:
                # Close the writer even when processing fails.
                ew.close()
        finally:
            # Close the reader even when a later step fails.
            kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Keep the original exception as the cause so its traceback survives.
        raise KGTKException(str(e)) from e
Beispiel #3
0
    def process(self):
        """Read the KGTK input, key each row, and pass the rows to process_row().

        The method:

        1. Opens the input with KgtkReader.
        2. Creates a KgtkIdBuilder when self.build_id is set; the builder then
           supplies the output column names, otherwise the input column names
           are reused.
        3. Builds the list of key column indexes from the required columns of
           an edge or node file, followed by self.key_column_names.
        4. Opens the output writer.
        5. Passes every row, with the key built by self.build_key(), to
           self.process_row(): in input order when self.sorted_input is set
           (optionally verifying the order), otherwise in sorted key order
           after loading all rows into memory.
        6. Makes a final process_row() call with flush=True.

        The reader and the writer are closed when the method exits, whether
        it returns normally or raises.  All verbose diagnostics are written
        to self.error_file.

        Raises:
            ValueError: if ID building is requested without ID builder
                options, if a key column name is not in the input file, or if
                self.verify_sort is set and the input changes sort direction.
        """
        # Local import: keeps this method self-contained.
        from contextlib import ExitStack

        # Report where the input comes from.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
            else:
                print("Reading the input data from stdin", file=self.error_file, flush=True)

        kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                         error_file=self.error_file,
                                         options=self.reader_options,
                                         value_options=self.value_options,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose,
        )

        # Every resource registered on the stack is closed when the block
        # exits, including when an exception propagates out of it.
        with ExitStack() as cleanup:
            cleanup.callback(kr.close)

            # If requested, create the ID column builder.
            # Assemble the list of output column names.
            output_column_names: typing.List[str]
            idb: typing.Optional[KgtkIdBuilder] = None
            if self.build_id:
                if self.idbuilder_options is None:
                    raise ValueError("ID build requested but ID builder options are missing")
                idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
                output_column_names = idb.column_names
            else:
                output_column_names = kr.column_names

            # Build the list of key column indexes.
            key_idx_list: typing.List[int] = []
            if kr.is_edge_file:
                # Add the KGTK edge file required columns.
                key_idx_list.append(kr.node1_column_idx)
                key_idx_list.append(kr.label_column_idx)
                key_idx_list.append(kr.node2_column_idx)
                # The ID column is part of the key only when IDs are not
                # being compacted and the input actually has one.
                if not self.compact_id and kr.id_column_idx >= 0:
                    key_idx_list.append(kr.id_column_idx)

            elif kr.is_node_file:
                # Add the KGTK node file required column.
                key_idx_list.append(kr.id_column_idx)

            # Append additional columns to the list of key column indexes,
            # silently removing duplicates, but complaining about unknown names.
            #
            # TODO: warn about duplicates?
            column_name: str
            for column_name in self.key_column_names:
                if column_name not in kr.column_name_map:
                    raise ValueError("Column %s is not in the input file" % (column_name))
                key_idx: int = kr.column_name_map[column_name]
                if key_idx not in key_idx_list:
                    key_idx_list.append(key_idx)

            if self.verbose:
                # Diagnostics go to the error file, like every other verbose
                # message here, so they never mix with data on stdout.
                print("key indexes: %s" % " ".join([str(idx) for idx in key_idx_list]),
                      file=self.error_file,
                      flush=True)

            # Open the output file.
            ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                             self.output_file_path,
                                             mode=kr.mode,
                                             require_all_columns=False,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=True,
                                             gzip_in_parallel=False,
                                             verbose=self.verbose,
                                             very_verbose=self.very_verbose)
            cleanup.callback(ew.close)

            input_line_count: int = 0
            # Stays empty when the input has no data rows; the final flush
            # call below then receives an empty row.
            row: typing.List[str] = []
            input_key: str
            prev_input_key: typing.Optional[str] = None
            # None until two different keys have been seen; then True for
            # ascending input and False for descending input.
            going_up: typing.Optional[bool] = None
            if self.sorted_input:
                if self.verbose:
                    print("Reading the input data from %s" % self.input_file_path, file=self.error_file, flush=True)
                for row in kr:
                    input_line_count += 1
                    input_key = self.build_key(row, key_idx_list)
                    if self.verify_sort:
                        if prev_input_key is not None and input_key != prev_input_key:
                            ascending: bool = prev_input_key < input_key
                            if going_up is None:
                                # The first change of key fixes the direction.
                                going_up = ascending
                            elif going_up != ascending:
                                raise ValueError("Line %d sort violation going %s: prev='%s' curr='%s'"
                                                 % (input_line_count,
                                                    "up" if going_up else "down",
                                                    prev_input_key,
                                                    input_key))
                        prev_input_key = input_key

                    self.process_row(input_key, row, input_line_count, idb, ew)

            else:
                if self.verbose:
                    print("Sorting the input data from %s" % self.input_file_path, file=self.error_file, flush=True)
                # Map key values to the lists of input rows that share them.
                input_map: typing.MutableMapping[str, typing.List[typing.List[str]]] = {}

                for row in kr:
                    input_line_count += 1
                    input_key = self.build_key(row, key_idx_list)
                    input_map.setdefault(input_key, []).append(row)

                if self.verbose:
                    print("Processing the sorted input data", file=self.error_file, flush=True)

                # NOTE: input_line_count is the total number of rows read at
                # this point, not the line number of the row being processed.
                for input_key in sorted(input_map):
                    for row in input_map[input_key]:
                        self.process_row(input_key, row, input_line_count, idb, ew)

            # Flush the final row, if any.  We pass the last row read for
            # feedback, such as an ID uniqueness violation.
            self.process_row("", row, input_line_count, idb, ew, flush=True)

            if self.verbose:
                print("Read %d records, wrote %d records." % (input_line_count, self.output_line_count), file=self.error_file, flush=True)