Exemple #1
0
    def process(self):
        """Concatenate the input KGTK files into a single output file.

        Every path in ``self.input_file_paths`` is opened with
        ``KgtkReader.open``; the readers' column names are merged with
        ``KgtkMergeColumns``; a ``KgtkWriter`` is opened on
        ``self.output_path`` with the merged columns; and each input's rows
        are copied to the writer in input order, using a per-input shuffle
        list to place the values in the merged column layout.

        Progress messages are printed to ``self.error_file`` when
        ``self.verbose`` / ``self.very_verbose`` are set.

        The writer and all readers opened here are closed before returning,
        whether the method completes or raises.

        Raises:
            ValueError: if standard input ("-") appears more than once in
                ``self.input_file_paths``; if an edge file and a node file
                are both present; if ``self.output_column_names`` is given
                but its length differs from the number of merged columns; or
                if a reader reports no file path.
        """
        if self.verbose:
            print("Starting kgtkcat pid=%d" % (os.getpid()),
                  file=self.error_file,
                  flush=True)

        # Standard input can only be consumed once.  Check for a repeated "-"
        # up front so that we fail before any file has been opened.
        saw_stdin: bool = False
        idx: int
        input_file_path: Path
        for idx, input_file_path in enumerate(self.input_file_paths):
            if str(input_file_path) == "-":
                if saw_stdin:
                    raise ValueError("Duplicate standard input file %d" %
                                     (idx + 1))
                saw_stdin = True

        kmc: KgtkMergeColumns = KgtkMergeColumns()

        # Is the output file an edge file, a node file, or unknown?
        is_edge_file: bool = False
        is_node_file: bool = False

        krs: typing.List[KgtkReader] = []
        kr: KgtkReader
        ew: typing.Optional[KgtkWriter] = None

        if self.verbose:
            print("Opening the %d input files." % len(self.input_file_paths),
                  file=self.error_file,
                  flush=True)

        try:
            for idx, input_file_path in enumerate(self.input_file_paths):
                if self.verbose:
                    if str(input_file_path) == "-":
                        print("Opening file %d: standard input" % (idx + 1),
                              file=self.error_file,
                              flush=True)
                    else:
                        print("Opening file %d: %s" %
                              (idx + 1, str(input_file_path)),
                              file=self.error_file,
                              flush=True)

                kr = KgtkReader.open(
                    input_file_path,
                    who="input " + str(idx + 1),
                    options=self.reader_options,
                    value_options=self.value_options,
                    error_file=self.error_file,
                    verbose=self.verbose,
                    very_verbose=self.very_verbose,
                )
                # Register the reader immediately so the finally clause
                # closes it even if a later step raises.
                krs.append(kr)

                # Unless directed otherwise, do not merge edge files with node
                # files.  If options.mode == KgtkReaderMode.NONE, then neither
                # kr.is_edge_file nor kr.is_node_file will be set and the
                # consistency check will be skipped.
                if kr.is_edge_file:
                    if is_node_file:
                        raise ValueError(
                            "Cannot merge an edge file to a node file: %s" %
                            input_file_path)
                    if not is_edge_file and self.verbose:
                        print("The output file will be an edge file.",
                              file=self.error_file,
                              flush=True)
                    is_edge_file = True
                elif kr.is_node_file:
                    if is_edge_file:
                        raise ValueError(
                            "Cannot merge a node file to an edge file: %s" %
                            input_file_path)
                    if not is_node_file and self.verbose:
                        print("The output file will be an node file.",
                              file=self.error_file,
                              flush=True)
                    is_node_file = True

                if self.verbose or self.very_verbose:
                    print("Mapping the %d column names in %s." %
                          (len(kr.column_names), input_file_path),
                          file=self.error_file,
                          flush=True)
                if self.very_verbose:
                    print(" ".join(kr.column_names),
                          file=self.error_file,
                          flush=True)
                new_column_names: typing.List[str] = kmc.merge(kr.column_names)
                if self.very_verbose:
                    print(" ".join(new_column_names),
                          file=self.error_file,
                          flush=True)

            if self.verbose or self.very_verbose:
                print("There are %d merged columns." % len(kmc.column_names),
                      file=self.error_file,
                      flush=True)
            if self.very_verbose:
                print(" ".join(kmc.column_names),
                      file=self.error_file,
                      flush=True)

            if self.output_column_names is not None:
                if self.verbose:
                    print("There are %d new output column names." %
                          len(self.output_column_names),
                          file=self.error_file,
                          flush=True)
                if len(self.output_column_names) != len(kmc.column_names):
                    raise ValueError(
                        "There are %d merged columns, but %d output column names."
                        % (len(kmc.column_names),
                           len(self.output_column_names)))

            output_mode: KgtkWriter.Mode = KgtkWriter.Mode.NONE
            if is_edge_file:
                output_mode = KgtkWriter.Mode.EDGE
                if self.verbose:
                    print("Opening the output edge file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)
            elif is_node_file:
                output_mode = KgtkWriter.Mode.NODE
                if self.verbose:
                    print("Opening the output node file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)
            else:
                if self.verbose:
                    print("Opening the output file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)

            ew = KgtkWriter.open(
                kmc.column_names,
                self.output_path,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                use_mgzip=self.reader_options.use_mgzip,  # Hack!
                mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
                gzip_in_parallel=False,
                mode=output_mode,
                output_format=self.output_format,
                output_column_names=self.output_column_names,
                old_column_names=self.old_column_names,
                new_column_names=self.new_column_names,
                verbose=self.verbose,
                very_verbose=self.very_verbose)

            output_data_lines: int = 0
            for idx, kr in enumerate(krs):
                if kr.file_path is None:
                    # This shouldn't happen because we constrained all
                    # input_file_path elements to be not None.  However,
                    # checking here keeps mypy happy.
                    #
                    # TODO: throw a better exception.
                    raise ValueError("Missing file path.")
                input_file_path = kr.file_path
                if self.verbose:
                    print("Copying data from file %d: %s" %
                          (idx + 1, input_file_path),
                          file=self.error_file,
                          flush=True)

                # Map this input's (merged) column names onto the writer's
                # column positions.
                shuffle_list: typing.List[int] = ew.build_shuffle_list(
                    kmc.new_column_name_lists[idx])

                input_data_lines: int = 0
                row: typing.List[str]
                for row in kr:
                    input_data_lines += 1
                    output_data_lines += 1
                    ew.write(row, shuffle_list=shuffle_list)

                # Flush the output file so far:
                ew.flush()

                if self.verbose:
                    print("Read %d data lines from file %d: %s" %
                          (input_data_lines, idx + 1, input_file_path),
                          file=self.error_file,
                          flush=True)

            if self.verbose:
                print("Wrote %d lines total from %d files" %
                      (output_data_lines, len(krs)),
                      file=self.error_file,
                      flush=True)
        finally:
            # Close the open files, on success and on failure alike.  The
            # nested try ensures the readers are closed even if closing the
            # writer raises.
            try:
                if ew is not None:
                    ew.close()
            finally:
                for open_kr in krs:
                    open_kr.close()
Exemple #2
0
    def process(self):
        """Join the left and right KGTK input files into one output file.

        Opens ``self.left_file_path`` and ``self.right_file_path`` with
        ``KgtkReader.open``, checks that they may be joined
        (``self.ok_to_join``), builds the join-column index lists, and asks
        ``self.join_key_sets`` for the set of join keys to keep.  The column
        names of both inputs are merged with ``KgtkMergeColumns`` and a
        ``KgtkWriter`` is opened on ``self.output_path``.  All left rows, then
        all right rows, are copied to the output; when the joined key set is
        not ``None`` only rows whose join key is in the set are kept.

        Progress messages are printed to ``self.error_file`` when
        ``self.verbose`` is set.

        Both readers and the writer are closed before returning, whether the
        method completes, exits early, or raises.

        Returns:
            1 if the inputs may not be joined or if the left and right join
            keys have a different number of components; otherwise ``None``.
        """
        left_kr: typing.Optional[KgtkReader] = None
        right_kr: typing.Optional[KgtkReader] = None
        ew: typing.Optional[KgtkWriter] = None

        output_data_lines: int = 0
        left_data_lines_read: int = 0
        left_data_lines_kept: int = 0
        right_data_lines_read: int = 0
        right_data_lines_kept: int = 0

        try:
            if self.verbose:
                print("Opening the left edge file: %s" % str(self.left_file_path), file=self.error_file, flush=True)
            left_kr = KgtkReader.open(self.left_file_path,
                                      who="left input",
                                      options=self.left_reader_options,
                                      value_options=self.value_options,
                                      error_file=self.error_file,
                                      verbose=self.verbose,
                                      very_verbose=self.very_verbose)

            if self.verbose:
                print("Opening the right edge file: %s" % str(self.right_file_path), file=self.error_file, flush=True)
            right_kr = KgtkReader.open(self.right_file_path,
                                       who="right input",
                                       options=self.right_reader_options,
                                       value_options=self.value_options,
                                       error_file=self.error_file,
                                       verbose=self.verbose,
                                       very_verbose=self.very_verbose)

            if not self.ok_to_join(left_kr, right_kr):
                return 1

            left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, self.LEFT, self.left_join_columns)
            right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, self.RIGHT, self.right_join_columns)
            if len(left_join_idx_list) != len(right_join_idx_list):
                print("the left join key has %d components, the right join key has %d columns. Exiting." % (len(left_join_idx_list), len(right_join_idx_list)), file=self.error_file, flush=True)
                return 1

            # This might open the input files for a second time. This won't work with stdin.
            joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets(left_join_idx_list, right_join_idx_list)

            if self.verbose:
                print("Mapping the column names for the join.", file=self.error_file, flush=True)
            kmc: KgtkMergeColumns = KgtkMergeColumns()
            kmc.merge(left_kr.column_names, prefix=self.left_prefix)
            right_column_names: typing.List[str] = kmc.merge(right_kr.column_names, prefix=self.right_prefix)
            joined_column_names: typing.List[str] = kmc.column_names

            if self.verbose:
                print("       left   columns: %s" % " ".join(left_kr.column_names), file=self.error_file, flush=True)
                print("       right  columns: %s" % " ".join(right_kr.column_names), file=self.error_file, flush=True)
                print("mapped right  columns: %s" % " ".join(right_column_names), file=self.error_file, flush=True)
                print("       joined columns: %s" % " ".join(joined_column_names), file=self.error_file, flush=True)

            if self.verbose:
                print("Opening the output edge file: %s" % str(self.output_path), file=self.error_file, flush=True)
            ew = KgtkWriter.open(joined_column_names,
                                 self.output_path,
                                 mode=left_kr.mode,
                                 require_all_columns=False,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=True,
                                 gzip_in_parallel=False,
                                 verbose=self.verbose,
                                 very_verbose=self.very_verbose)

            if self.verbose:
                print("Processing the left input file: %s" % str(self.left_file_path), file=self.error_file, flush=True)
            row: typing.List[str]
            for row in left_kr:
                left_data_lines_read += 1
                # A joined key set of None means "keep every row".
                if joined_key_set is None or self.build_join_key(left_kr, left_join_idx_list, row) in joined_key_set:
                    ew.write(row)
                    output_data_lines += 1
                    left_data_lines_kept += 1
            # Flush the output file so far:
            ew.flush()

            if self.verbose:
                print("Processing the right input file: %s" % str(self.right_file_path), file=self.error_file, flush=True)
            # Right rows are written through a shuffle list that maps the
            # (possibly renamed) right columns onto the joined column layout.
            right_shuffle_list: typing.List[int] = ew.build_shuffle_list(right_column_names)
            for row in right_kr:
                right_data_lines_read += 1
                if joined_key_set is None or self.build_join_key(right_kr, right_join_idx_list, row) in joined_key_set:
                    ew.write(row, shuffle_list=right_shuffle_list)
                    output_data_lines += 1
                    right_data_lines_kept += 1
        finally:
            # Close whatever was opened, on every exit path.  The nesting
            # keeps later closes running even if an earlier close raises.
            try:
                if ew is not None:
                    ew.close()
            finally:
                try:
                    if left_kr is not None:
                        left_kr.close()
                finally:
                    if right_kr is not None:
                        right_kr.close()

        if self.verbose:
            print("The join is complete", file=self.error_file, flush=True)
            print("%d left input data lines read, %d kept" % (left_data_lines_read, left_data_lines_kept), file=self.error_file, flush=True)
            print("%d right input data lines read, %d kept" % (right_data_lines_read, right_data_lines_kept), file=self.error_file, flush=True)
            print("%d data lines written." % output_data_lines, file=self.error_file, flush=True)