コード例 #1
0
    def process(self):
        """Open the input and optional label readers, then run the lift.

        The merge-based implementation (``process_as_merge``) is used only
        when exactly one lift column is named, empty columns are not
        suppressed, both the input and the labels are declared presorted,
        and a label file was supplied.  Every other configuration is handled
        by ``process_in_memory``.
        """
        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        ikr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        # If supplied, open the label file.
        lkr: typing.Optional[KgtkReader] = None
        if self.label_file_path is not None:
            if self.verbose:
                # The label path is known to be set in this branch, so report
                # it directly.  (The previous code tested the *input* path
                # here and could claim the labels were read from stdin.)
                print("Opening the label file: %s" % self.label_file_path,
                      file=self.error_file,
                      flush=True)

            lkr = KgtkReader.open(
                self.label_file_path,
                error_file=self.error_file,
                options=self.reader_options,
                value_options=self.value_options,
                verbose=self.verbose,
                very_verbose=self.very_verbose,
            )

        if self.lift_column_names is not None and len(self.lift_column_names) == 1 and \
           not self.suppress_empty_columns and \
           self.input_is_presorted and \
           self.labels_are_presorted and \
           lkr is not None:
            self.process_as_merge(ikr, lkr)
        else:
            self.process_in_memory(ikr, lkr)
コード例 #2
0
ファイル: unique.py プロジェクト: dguimard/kgtk
    def process(self):
        """Resolve the output layout and filters, then count unique values.

        Steps, in order:
          1. choose the output column names from ``self.output_format``;
          2. open the input reader;
          3. resolve the indexes of the columns to examine (``node2`` by
             default when ``self.column_names`` is None);
          4. resolve the optional "where" column and its accepted values;
          5. dispatch to ``process_presorted`` (presorted input, a single
             column, and any format other than NODE_FORMAT) or otherwise to
             ``process_unsorted``.

        Raises:
            ValueError: for an unknown output format, a missing default
                ``node2`` column, an unknown column name, an unknown where
                column, or a where column without values to test.
        """
        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
            else:
                print("Reading the input data from stdin", file=self.error_file, flush=True)

        output_columns: typing.List[str]
        if self.output_format == self.EDGE_FORMAT:
            output_columns = ["node1", "label", "node2"]

        elif self.output_format == self.NODE_ONLY_FORMAT:
            output_columns = ["id"]

        elif self.output_format == self.NODE_COUNTS_FORMAT:
            output_columns = ["id", self.label_value]

        elif self.output_format == self.NODE_FORMAT:
            output_columns = ["id"]  # Add more later

        else:
            raise ValueError("Unknown output format %s" % str(self.output_format))

        kr: KgtkReader = KgtkReader.open(self.input_file_path,
                                         error_file=self.error_file,
                                         options=self.reader_options,
                                         value_options=self.value_options,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose,
                                         )

        column_idxs: typing.List[int]
        if self.column_names is None:
            if kr.node2_column_idx < 0:
                raise ValueError("No node2 default column name in the input file.")
            column_idxs = [kr.node2_column_idx]
        else:
            column_idxs = []
            column_name: str
            for column_name in self.column_names:
                if column_name not in kr.column_name_map:
                    raise ValueError("Column %s is not in the input file" % (column_name))
                column_idxs.append(kr.column_name_map[column_name])

        where_column_idx: int = -1
        # NOTE: `{ }` is an empty dict, not a set; use set() so the value
        # matches its annotation even when no where-filter is requested.
        where_value_set: typing.Set[str] = set()
        if self.where_column_name is not None:
            if self.where_column_name not in kr.column_name_map:
                raise ValueError("Where column '%s' is not in the input file." % (self.where_column_name))
            where_column_idx = kr.column_name_map[self.where_column_name]
            if self.where_values is None or len(self.where_values) == 0:
                raise ValueError("Where column '%s' but no values to test." % (self.where_column_name))
            where_value_set = set(self.where_values)

        if self.presorted and self.output_format != self.NODE_FORMAT and len(column_idxs) == 1:
            self.process_presorted(output_columns, kr, column_idxs[0], where_column_idx, where_value_set)
        else:
            self.process_unsorted(output_columns, kr, column_idxs, where_column_idx, where_value_set)
コード例 #3
0
    def extract_join_key_set(
            self, file_path: Path, who: str,
            join_idx_list: typing.List[int]) -> typing.Set[str]:
        """Read one join input and return the set of its join keys.

        Args:
            file_path: the KGTK file to scan.
            who: which side is being read; compared against ``self.LEFT`` to
                pick the left or right reader options, and used in messages.
            join_idx_list: indexes of the columns that make up the join key.

        Returns:
            The key set produced by ``single_column_key_set`` (one join
            column) or ``multi_column_key_set`` (otherwise).
        """
        if self.verbose:
            message = "Extracting the join key set from the %s input file: %s" % (
                who, str(file_path))
            print(message, file=self.error_file, flush=True)

        # Each side of the join carries its own reader options.
        reader_options: typing.Optional[KgtkReaderOptions] = (
            self.left_reader_options if who == self.LEFT
            else self.right_reader_options)

        reader: KgtkReader = KgtkReader.open(
            file_path,
            who=who + " input",
            options=reader_options,
            value_options=self.value_options,
            error_file=self.error_file,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        # NOTE(review): the original comments say both helpers close the
        # reader; confirm against their definitions.
        if len(join_idx_list) != 1:
            return self.multi_column_key_set(reader, join_idx_list)

        # A single join column takes the optimized path.
        return self.single_column_key_set(reader, join_idx_list[0])
コード例 #4
0
def read_metadata_file(metadata_file):
    """Load a KGTK edge file of metadata into a nested dictionary.

    Each edge ``(node1, label, node2)`` is stored as
    ``result[node1][label] = node2``.  A ``node1`` containing ``'-'`` is
    folded into the entry for the text before its first ``'-'`` (its
    ``datatype`` label is stored as ``qualifier_datatype``), but only if that
    entry already exists; such rows never create an entry on their own.

    Side effect: every ``node1`` that has an edge with label ``P7482`` and
    node2 ``Q108739856`` is recorded in the module-level
    ``PROFILED_PROPERTY_METADATA`` with value 1.

    Returns:
        dict mapping node1 -> {label: node2}.
    """
    reader: KgtkReader = KgtkReader.open(
        Path(metadata_file),
        error_file=sys.stderr,
        mode=KgtkReaderMode.EDGE,
    )

    columns = reader.column_name_map
    node1_idx = columns['node1']
    node2_idx = columns['node2']
    label_idx = columns['label']

    sorting_metadata = {}
    for row in reader:
        subject, prop, value = row[node1_idx], row[label_idx], row[node2_idx]

        # The flag is keyed by the full node1, before any '-' suffix is dropped.
        if prop == 'P7482' and value == 'Q108739856':
            PROFILED_PROPERTY_METADATA[subject] = 1

        if '-' in subject:
            # Fold a "<base>-<suffix>" subject into its base entry.
            subject = subject.split('-')[0]
            if prop == 'datatype':
                prop = 'qualifier_datatype'
        elif subject not in sorting_metadata:
            sorting_metadata[subject] = {}

        # Rows whose base entry has not been seen yet are dropped.
        if subject in sorting_metadata:
            sorting_metadata[subject][prop] = value

    return sorting_metadata
コード例 #5
0
    def process(self):
        """Copy the input edges to the output as header-less triples.

        Each output line holds the node1 value, the value of the column
        located via ``kr.get_id_column_index('relation')``, and the node2
        value, in that order.  The header that ``KgtkWriter.open`` emits is
        removed before any data is written.
        """
        if self.verbose:
            print("Opening the input file: %s" % str(self.input_file_path),
                  file=self.error_file, flush=True)

        # Open the input file.
        reader: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.verbose:
            print("Opening the output file: %s" % str(self.output_file_path),
                  file=self.error_file, flush=True)

        # Open the output file with the same column names and mode as the input.
        writer: KgtkWriter = KgtkWriter.open(
            reader.column_names,
            self.output_file_path,
            mode=KgtkWriter.Mode[reader.mode.name],
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.verbose:
            print("Processing the input records.",
                  file=self.error_file, flush=True)

        # Column positions of the triple: node1, relation, node2.
        subject_idx = reader.get_node1_column_index()
        object_idx = reader.get_node2_column_index()
        relation_idx = reader.get_id_column_index('relation')

        # The writer has already emitted a header line that the consumer
        # (PBG, per the original comment) does not want: rewind and truncate.
        # NOTE(review): presumably requires a seekable output file; confirm.
        out = writer.file_out
        out.seek(0)
        out.truncate()

        record_count: int = 0
        record: typing.List[str]
        for record_count, record in enumerate(reader, 1):
            writer.write([record[subject_idx],
                          record[relation_idx],
                          record[object_idx]])

        if self.verbose:
            print("Processed %d records." % (record_count),
                  file=self.error_file, flush=True)

        writer.close()
コード例 #6
0
ファイル: kgtkwriter.py プロジェクト: karanpraharaj/kgtk
def main():
    """
    Test the KGTK edge file writer.

    Reads the input KGTK file with KgtkReader and copies every row to the
    output KGTK file with KgtkWriter, optionally renaming output columns.
    With --verbose, the number of copied lines is reported on the error
    stream (stderr, or stdout when --errors-to-stdout is given).

    TODO: full reader options.

    TODO:  --show-options
    """
    parser = ArgumentParser()
    parser.add_argument(dest="input_kgtk_file", help="The KGTK file to read", type=Path, nargs="?")
    parser.add_argument(dest="output_kgtk_file", help="The KGTK file to write", type=Path, nargs="?")
    # This option was read below (args.errors_to_stdout) but never declared,
    # which made every invocation fail with AttributeError.
    parser.add_argument(      "--errors-to-stdout", dest="errors_to_stdout",
                              help="Send errors to stdout instead of stderr.", action='store_true')
    parser.add_argument(      "--header-error-action", dest="header_error_action",
                              help="The action to take when a header error is detected  Only ERROR or EXIT are supported.",
                              type=ValidationAction, action=EnumNameAction, default=ValidationAction.EXIT)
    parser.add_argument(      "--gzip-in-parallel", dest="gzip_in_parallel", help="Execute gzip in a subthread.", action='store_true')
    parser.add_argument(      "--input-mode", dest="input_mode",
                              help="Determine the input KGTK file mode.", type=KgtkReader.Mode, action=EnumNameAction, default=KgtkReader.Mode.AUTO)
    parser.add_argument(      "--output-mode", dest="output_mode",
                              help="Determine the output KGTK file mode.", type=KgtkWriter.Mode, action=EnumNameAction, default=KgtkWriter.Mode.AUTO)
    parser.add_argument(      "--output-format", dest="output_format", help="The file format (default=kgtk)", type=str)
    parser.add_argument(      "--output-columns", dest="output_column_names", help="Rename all output columns. (default=%(default)s)", type=str, nargs='+')
    parser.add_argument(      "--old-columns", dest="old_column_names", help="Rename seleted output columns: old names. (default=%(default)s)", type=str, nargs='+')
    parser.add_argument(      "--new-columns", dest="new_column_names", help="Rename seleted output columns: new names. (default=%(default)s)", type=str, nargs='+')
    parser.add_argument("-v", "--verbose", dest="verbose", help="Print additional progress messages.", action='store_true')
    parser.add_argument(      "--very-verbose", dest="very_verbose", help="Print additional progress messages.", action='store_true')
    args = parser.parse_args()

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if args.errors_to_stdout else sys.stderr

    kr: KgtkReader = KgtkReader.open(args.input_kgtk_file,
                                     error_file=error_file,
                                     header_error_action=args.header_error_action,
                                     gzip_in_parallel=args.gzip_in_parallel,
                                     mode=args.input_mode,
                                     verbose=args.verbose, very_verbose=args.very_verbose)

    kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                     args.output_kgtk_file,
                                     error_file=error_file,
                                     gzip_in_parallel=args.gzip_in_parallel,
                                     header_error_action=args.header_error_action,
                                     mode=args.output_mode,
                                     output_format=args.output_format,
                                     output_column_names=args.output_column_names,
                                     old_column_names=args.old_column_names,
                                     new_column_names=args.new_column_names,
                                     verbose=args.verbose, very_verbose=args.very_verbose)

    # Copy every data row from the reader to the writer.
    line_count: int = 0
    row: typing.List[str]
    for row in kr:
        kw.write(row)
        line_count += 1
    kw.close()
    if args.verbose:
        print("Copied %d lines" % line_count, file=error_file, flush=True)
コード例 #7
0
    def process(self):
        """Label the connected components of the input graph and write them.

        The input file is opened with KgtkReader to resolve the key columns
        (via ``self.get_key_columns``); the graph itself is then loaded from
        the same path with ``load_graph_from_csv``.  When ``self.properties``
        is set (a comma-separated string), the graph's edges are replaced by
        the edges found for those property values before labeling.

        One output edge ``(<vertex name>, 'connected_component', <component>)``
        is written per vertex.
        """
        input_kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            who="input",
            options=self.input_reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        input_key_columns: typing.List[int] = self.get_key_columns(
            input_kr, "input")
        label_col_idx = input_key_columns[1]
        # NOTE(review): presumably the loader names edge property columns
        # "c<index>" when the CSV columns are unnamed; confirm against the
        # load_graph_from_csv documentation.
        label = '{}{}'.format('c', label_col_idx)

        g = load_graph_from_csv(str(input_kr.file_path),
                                not (self.undirected),
                                skip_first=not (self.no_header),
                                hashed=True,
                                csv_options={'delimiter': '\t'},
                                ecols=(input_key_columns[0],
                                       input_key_columns[2]))

        es = []
        header = ['node1', 'label', 'node2']
        if self.properties:
            # Keep only the edges whose label matches a requested property.
            properties = self.properties.split(',')
            for e in properties:
                es += (find_edge(g, g.edge_properties[label], e))
            g.clear_edges()
            g.add_edge_list(list(set(es)))
        comp, hist = label_components(g, directed=self.strong)

        ew: KgtkWriter = KgtkWriter.open(header,
                                         self.output_file_path,
                                         mode=input_kr.mode,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)
        for v, c in enumerate(comp):
            ew.write([
                g.vertex_properties['name'][v], 'connected_component',
                str(c)
            ])

        # Close the writer so buffered output is flushed; the original code
        # left it open.
        ew.close()
コード例 #8
0
ファイル: kgtkcopytemplate.py プロジェクト: yyht/kgtk
    def process(self):
        """Copy every row of the input KGTK file to the output KGTK file.

        The output is opened with the input's column names and the writer
        mode of the same name as the reader's mode.  Progress messages go to
        ``self.error_file`` when ``self.verbose`` is set.
        """
        verbose = self.verbose
        log = self.error_file

        if verbose:
            print("Opening the input file: %s" % str(self.input_file_path), file=log, flush=True)

        # Open the input file.
        reader: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if verbose:
            print("Opening the output file: %s" % str(self.output_file_path), file=log, flush=True)

        # Open the output file.
        writer: KgtkWriter = KgtkWriter.open(
            reader.column_names,
            self.output_file_path,
            mode=KgtkWriter.Mode[reader.mode.name],
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if verbose:
            print("Processing the input records.", file=log, flush=True)

        # Pass each record through unchanged, counting as we go.
        copied: int = 0
        record: typing.List[str]
        for record in reader:
            copied += 1
            writer.write(record)

        if verbose:
            print("Processed %d records." % (copied), file=log, flush=True)

        writer.close()
コード例 #9
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        output_format: typing.Optional[str],
        column_names: typing.List[str],
        omit_remaining_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Reorder (and optionally drop) the columns of a KGTK file.

    ``column_names`` lists the desired column order.  Two operators are
    recognized in the list:

    * ``"..."`` (ellipses): stands for every column not otherwise mentioned.
      It may appear only once and not directly after a range operator.
    * ``".."`` (range): stands for every input column between the column
      named before it and the column named after it, in input order, or in
      reverse input order when the second column precedes the first.

    Args:
        input_file: the input file specification.
        output_file: the output file specification.
        output_format: optional output format passed to KgtkWriter.
        column_names: the requested column order, including operators.
        omit_remaining_columns: when there is no ellipses, drop unmentioned
            columns instead of raising.
        errors_to_stdout: send messages to stdout instead of stderr.
        errors_to_stderr: accepted for interface compatibility; not read here.
        show_options: print the effective options before processing.
        verbose: print progress messages.
        very_verbose: passed through to the reader and writer.
        **kwargs: reader and value options.

    Returns:
        0 on success.

    Raises:
        KGTKException: on any usage error, on SystemExit, or wrapping any
            other exception raised during processing.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(output_kgtk_file),
              file=error_file,
              flush=True)
        if output_format is not None:
            print("--output-format=%s" % output_format,
                  file=error_file,
                  flush=True)
        print("--columns %s" % " ".join(column_names),
              file=error_file,
              flush=True)
        print("--trim=%s" % str(omit_remaining_columns),
              file=error_file,
              flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        if verbose:
            print("Opening the input file %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Columns not yet placed in the output order.
        remaining_names: typing.List[str] = kr.column_names.copy()
        # Columns placed so far (after the ellipses, once it has been seen).
        reordered_names: typing.List[str] = []
        # Columns placed before the ellipses; None until the ellipses is seen.
        save_reordered_names: typing.Optional[typing.List[str]] = None

        ellipses: str = "..."  # All unmentioned columns
        ranger: str = ".."  # All columns between two columns.

        saw_ranger: bool = False
        column_name: str
        for column_name in column_names:
            if column_name == ellipses:
                if save_reordered_names is not None:
                    raise KGTKException("Elipses may appear only once")

                if saw_ranger:
                    raise KGTKException(
                        "ELipses may not appear directly after a range operator ('..')."
                    )

                save_reordered_names = reordered_names
                reordered_names = []
                continue

            if column_name == ranger:
                if len(reordered_names) == 0:
                    raise KGTKException(
                        "The column range operator ('..') may not appear without a preceeding column name."
                    )
                saw_ranger = True
                continue

            if column_name not in kr.column_names:
                raise KGTKException("Unknown column name '%s'." % column_name)
            if column_name not in remaining_names:
                raise KGTKException(
                    "Column name '%s' was duplicated in the list." %
                    column_name)

            if saw_ranger:
                saw_ranger = False
                prior_column_name: str = reordered_names[-1]
                prior_column_idx: int = kr.column_name_map[prior_column_name]
                column_name_idx: int = kr.column_name_map[column_name]

                # Walk the columns strictly between the two endpoints, in
                # input order for an ascending range and in reverse for a
                # descending one.  range() with a signed step covers both;
                # the previous `while idx <= end_idx` loop was only correct
                # for ascending ranges.
                idx_inc: int = 1 if column_name_idx > prior_column_idx else -1
                idx: int
                for idx in range(prior_column_idx + idx_inc, column_name_idx, idx_inc):
                    idx_column_name: str = kr.column_names[idx]
                    if idx_column_name not in remaining_names:
                        # Report the column that was actually duplicated.
                        raise KGTKException(
                            "Column name '%s' (%s .. %s) was duplicated in the list."
                            % (idx_column_name, prior_column_name, column_name))

                    reordered_names.append(idx_column_name)
                    remaining_names.remove(idx_column_name)

            reordered_names.append(column_name)
            remaining_names.remove(column_name)

        if saw_ranger:
            raise KGTKException(
                "The column ranger operator ('..') may not end the list of column names."
            )

        if len(remaining_names) > 0 and save_reordered_names is None:
            # There are remaining column names and the ellipses was not seen.
            if not omit_remaining_columns:
                raise KGTKException(
                    "No ellipses, and the following columns not accounted for: %s"
                    % " ".join(remaining_names))
            else:
                if verbose:
                    print("Omitting the following columns: %s" %
                          " ".join(remaining_names),
                          file=error_file,
                          flush=True)
        if save_reordered_names is not None:
            # Splice: columns before the ellipses, then every unmentioned
            # column, then the columns named after the ellipses.
            if len(remaining_names) > 0:
                save_reordered_names.extend(remaining_names)
            if len(reordered_names) > 0:
                save_reordered_names.extend(reordered_names)
            reordered_names = save_reordered_names

        if verbose:
            print("Opening the output file %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            reordered_names,
            output_kgtk_file,
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            mode=KgtkWriter.Mode[kr.mode.name],
            output_format=output_format,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Built once from the input column order and passed to every write.
        shuffle_list: typing.List = kw.build_shuffle_list(kr.column_names)

        input_data_lines: int = 0
        row: typing.List[str]
        for row in kr:
            input_data_lines += 1
            kw.write(row, shuffle_list=shuffle_list)

        # Flush the output file so far:
        kw.flush()

        if verbose:
            print("Read %d data lines from file %s" %
                  (input_data_lines, input_kgtk_file),
                  file=error_file,
                  flush=True)

        kw.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise KGTKException(str(e)) from e
コード例 #10
0
ファイル: replace-nodes.py プロジェクト: nicklein/kgtk
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        mapping_file: KGTKFiles,
        unmodified_edges_file: KGTKFiles,
        activated_mapping_file: KGTKFiles,
        rejected_mapping_file: KGTKFiles,

        confidence_column_name: str,
        require_confidence: bool,
        default_confidence_str: typing.Optional[str],
        confidence_threshold: float,

        same_as_item_label: str,
        same_as_property_label: str,
        allow_exact_duplicates: bool,
        allow_idempotent_mapping: bool,

        split_output_mode: bool,
        modified_pattern: str,

        node1_column_name: typing.Optional[str],
        label_column_name: typing.Optional[str],
        node2_column_name: typing.Optional[str],
        mapping_rule_mode: str,
        mapping_node1_column_name: typing.Optional[str],
        mapping_label_column_name: typing.Optional[str],
        mapping_node2_column_name: typing.Optional[str],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Rewrite node1/label/node2 values of an edge file using a mapping file.

    The mapping file is read first.  Each accepted mapping row becomes a
    rule ``node1 -> node2`` in one of two tables:

    * the item table, applied to the node1 and node2 columns of the input
      (rows whose label equals ``same_as_item_label``, or every row when
      ``mapping_rule_mode`` is ``"same-as-item"``);
    * the property table, applied to the label column of the input (rows
      whose label equals ``same_as_property_label``, or every row when
      ``mapping_rule_mode`` is ``"same-as-property"``).

    A mapping row is excluded when its confidence (taken from
    ``confidence_column_name``, falling back to ``default_confidence_str``)
    is below ``confidence_threshold``; such rows are copied to the rejected
    mapping file when one was requested.  Rows mapping a value to itself are
    dropped unless ``allow_idempotent_mapping`` is set.  A second rule for
    the same source value is an error unless it has the same target and
    ``allow_exact_duplicates`` is set.

    The input file is then processed row by row.  ``modified_pattern``
    (e.g. ``"node1|label|node2"`` or ``"node1&node2"``) selects which
    combination of replaced columns makes an edge count as modified.
    Modified edges are written, in rewritten form, to the output file.
    Unmodified edges are written to the unmodified edges file (when given)
    and also to the output file unless ``split_output_mode`` is set.  The
    mapping rows that were actually used are written, in mapping file order,
    to the activated mapping file when one was requested.

    ``errors_to_stderr`` is accepted but not consulted: diagnostics go to
    stdout when ``errors_to_stdout`` is true and to stderr otherwise.

    Returns:
        0 on success.

    Raises:
        KGTKException: for an unparsable default confidence value, missing
            columns, errors in the mapping file, an empty rule set, an
            unrecognized ``modified_pattern``, or any other exception raised
            while processing (which is re-wrapped).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    mapping_kgtk_file: Path = KGTKArgumentParser.get_input_file(mapping_file, who="KGTK mappping file")
    unmodified_edges_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmodified_edges_file, who="KGTK unmodified edges output file")
    activated_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(activated_mapping_file, who="KGTK activated mapping output file")
    rejected_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(rejected_mapping_file, who="KGTK rejected mapping output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    mapping_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="mapping", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % repr(str(input_kgtk_file)), file=error_file, flush=True)
        print("--output-file=%s" % repr(str(output_kgtk_file)), file=error_file, flush=True)
        print("--mapping-file=%s" % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        if unmodified_edges_kgtk_file is not None:
            print("--unmodified-edges-file=%s" % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
        if activated_mapping_kgtk_file is not None:
            print("--activated-mapping-edges-file=%s" % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
        if rejected_mapping_kgtk_file is not None:
            print("--rejected-mapping-edges-file=%s" % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)

        print("--confidence-column=%s" % repr(confidence_column_name), file=error_file, flush=True)
        print("--require-confidence=%s" % repr(require_confidence), file=error_file, flush=True)
        if default_confidence_str is not None:
            print("--default-confidence-value=%s" % default_confidence_str, file=error_file, flush=True)
        print("--threshold=%f" % confidence_threshold, file=error_file, flush=True)

        print("--same-as-item-label=%s" % repr(same_as_item_label), file=error_file, flush=True)
        print("--same-as-property-label=%s" % repr(same_as_property_label), file=error_file, flush=True)
        print("--allow-exact-duplicates=%s" % repr(allow_exact_duplicates), file=error_file, flush=True)
        print("--allow-idempotent-actions=%s" % repr(allow_idempotent_mapping), file=error_file, flush=True)

        print("--split-output-mode=%s" % repr(split_output_mode), file=error_file, flush=True)
        print("--modified-pattern=%s" % repr(modified_pattern), file=error_file, flush=True)

        if node1_column_name is not None:
            print("--node1-column-=%s" % repr(node1_column_name), file=error_file, flush=True)
        if label_column_name is not None:
            print("--label-column-=%s" % repr(label_column_name), file=error_file, flush=True)
        if node2_column_name is not None:
            print("--node2-column-=%s" % repr(node2_column_name), file=error_file, flush=True)
        print("--mapping-rule-mode=%s" % repr(mapping_rule_mode), file=error_file, flush=True)
        if mapping_node1_column_name is not None:
            print("--mapping-node1-column-=%s" % repr(mapping_node1_column_name), file=error_file, flush=True)
        if mapping_label_column_name is not None:
            print("--mapping-label-column-=%s" % repr(mapping_label_column_name), file=error_file, flush=True)
        if mapping_node2_column_name is not None:
            print("--mapping-node2-column-=%s" % repr(mapping_node2_column_name), file=error_file, flush=True)

        input_reader_options.show(out=error_file, who="input")
        mapping_reader_options.show(out=error_file, who="mapping")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    # Parse the default confidence once, up front.  Only conversion failures
    # are trapped: a bare ``except`` here would also swallow
    # KeyboardInterrupt and SystemExit.
    default_confidence_value: typing.Optional[float] = None
    if default_confidence_str is not None:
        try:
            default_confidence_value = float(default_confidence_str)
        except (TypeError, ValueError):
            raise KGTKException("--default-confidence-value=%s is invalid" % repr(default_confidence_str))

    try:

        if verbose:
            print("Opening the mapping file %s." % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        mkr:  KgtkReader = KgtkReader.open(mapping_kgtk_file,
                                           options=mapping_reader_options,
                                           value_options = value_options,
                                           error_file=error_file,
                                           verbose=verbose,
                                           very_verbose=very_verbose,
        )
        # Report every missing mapping column before giving up, so the user
        # sees all of the problems in one run.
        trouble = False
        mapping_node1_idx: int = mkr.get_node1_column_index(mapping_node1_column_name)
        mapping_label_idx: int = mkr.get_label_column_index(mapping_label_column_name)
        mapping_node2_idx: int = mkr.get_node2_column_index(mapping_node2_column_name)
        if mapping_node1_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node1 column.", file=error_file, flush=True)
        # The label column is only consulted in "normal" mode.
        if mapping_label_idx < 0 and mapping_rule_mode == "normal":
            trouble = True
            print("Error: Cannot find the mapping file label column.", file=error_file, flush=True)
        if mapping_node2_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            mkr.close()
            raise KGTKException("Missing columns in the mapping file.")
        confidence_column_idx: int = mkr.column_name_map.get(confidence_column_name, -1)
        if require_confidence and confidence_column_idx < 0:
            mkr.close()
            raise KGTKException("The mapping file does not have a confidence column, and confidence is required.")

        # Optional output for mapping rows rejected by the confidence threshold.
        rmkw: typing.Optional[KgtkWriter] = None
        if rejected_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the rejected mapping edges file %s." % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)
            rmkw = KgtkWriter.open(mkr.column_names,
                                   rejected_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # Mapping structures:
        #   *_map:      source value -> replacement value
        #   *_line_map: source value -> mapping file line that defined the rule
        item_map: typing.MutableMapping[str, str] = dict()
        item_line_map: typing.MutableMapping[str, int] = dict()
        property_map: typing.MutableMapping[str, str] = dict()
        property_line_map: typing.MutableMapping[str, int] = dict()

        # Accepted mapping rows by line number, and the subset actually used
        # while processing the input file.
        mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()
        activated_mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()

        # Read the mapping file.
        if verbose:
            print("Processing the mapping file.", file=error_file, flush=True)
        mapping_confidence_exclusions: int = 0
        mapping_idempotent_exclusions: int = 0
        mapping_errors: int = 0
        mapping_line_number: int = 0
        mrow: typing.List[str]
        for mrow in mkr:
            mapping_line_number += 1
            mapping_node1: str = mrow[mapping_node1_idx]
            # Outside "normal" mode the label column may be absent, so it is
            # not read.
            mapping_label: str = mrow[mapping_label_idx] if mapping_rule_mode == "normal" else ""
            mapping_node2: str = mrow[mapping_node2_idx]
            mapping_confidence: typing.Optional[float] = default_confidence_value
            if confidence_column_idx >= 0:
                confidence_value_str: str = mrow[confidence_column_idx]
                if len(confidence_value_str) == 0:
                    # An empty confidence falls back to the default unless a
                    # per-row confidence is mandatory.
                    if require_confidence:
                        print("In line %d of the mapping file: the required confidence value is missing" % (mapping_line_number),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue
                else:
                    try:
                        mapping_confidence = float(confidence_value_str)
                    except ValueError:
                        print("In line %d of the mapping file: cannot parse confidence value %s" % (mapping_line_number, repr(mrow[confidence_column_idx])),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue
            # With no confidence available at all, the rule is accepted.
            if mapping_confidence is not None and mapping_confidence < confidence_threshold:
                mapping_confidence_exclusions += 1
                if rmkw is not None:
                    rmkw.write(mrow)
                continue

            if mapping_node1 == mapping_node2 and not allow_idempotent_mapping:
                mapping_idempotent_exclusions += 1
                continue

            if mapping_rule_mode == "same-as-item" or mapping_label == same_as_item_label:
                if mapping_node1 in item_map:
                    # A repeated rule is tolerated only when it is an exact
                    # duplicate and exact duplicates are allowed; either way
                    # the first rule is the one that is kept.
                    if mapping_node2 != item_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      item_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                    continue

                item_map[mapping_node1] = mapping_node2
                item_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            elif mapping_rule_mode == "same-as-property" or mapping_label == same_as_property_label:
                if mapping_node1 in property_map:
                    if mapping_node2 != property_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      property_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                    continue
                property_map[mapping_node1] = mapping_node2
                property_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            else:
                print("Unknown mapping action %s at line %d of mapping file %s" % (mapping_label,
                                                                                   mapping_line_number,
                                                                                   repr(str(mapping_kgtk_file))),
                      file=error_file, flush=True)
                mapping_errors += 1
                continue


        # Close the mapping file.
        mkr.close()
        if rmkw is not None:
            rmkw.close()

        if mapping_errors > 0:
            raise KGTKException("%d errors detected in the mapping file %s" % (mapping_errors, repr(str(mapping_kgtk_file))))

        if len(item_map) == 0 and len(property_map) == 0:
            raise KGTKException("Nothing read from the mapping file %s" % repr(str(mapping_kgtk_file)))

        if verbose:
            print("%d mapping lines, %d excluded for confidence, %d excluded for idempotency." % (mapping_line_number,
                                                                                                  mapping_confidence_exclusions,
                                                                                                  mapping_idempotent_exclusions),
                  file=error_file, flush=True)
            print("%d item mapping rules." % len(item_map), file=error_file, flush=True)
            print("%d property mapping rules." % len(property_map), file=error_file, flush=True)

        if verbose:
            print("Opening the input file %s." % repr(str(input_kgtk_file)), file=error_file, flush=True)
        ikr:  KgtkReader = KgtkReader.open(input_kgtk_file,
                                           options=input_reader_options,
                                           value_options = value_options,
                                           error_file=error_file,
                                           verbose=verbose,
                                           very_verbose=very_verbose,
        )
        # Only the input columns that the selected rule mode will rewrite
        # are required to exist.
        trouble = False
        input_node1_idx: int = ikr.get_node1_column_index(node1_column_name)
        input_label_idx: int = ikr.get_label_column_index(label_column_name)
        input_node2_idx: int = ikr.get_node2_column_index(node2_column_name)
        if input_node1_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node1 column.", file=error_file, flush=True)
        if input_label_idx < 0 and mapping_rule_mode in ["normal", "same-as-property"]:
            trouble = True
            print("Error: Cannot find the input file label column.", file=error_file, flush=True)
        if input_node2_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            ikr.close()
            raise KGTKException("Missing columns in the input file.")

        okw: KgtkWriter = KgtkWriter.open(ikr.column_names,
                                          output_kgtk_file,
                                          mode=KgtkWriter.Mode[ikr.mode.name],
                                          use_mgzip=input_reader_options.use_mgzip, # Hack!
                                          mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose)

        uekw: typing.Optional[KgtkWriter] = None
        if unmodified_edges_kgtk_file is not None:
            if verbose:
                print("Opening the unmodified edges file %s." % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
            uekw = KgtkWriter.open(ikr.column_names,
                                   unmodified_edges_kgtk_file,
                                   mode=KgtkWriter.Mode[ikr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # The activated mapping file reuses the mapping file's columns and mode.
        amkw: typing.Optional[KgtkWriter] = None
        if activated_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the activated mapping edges file %s." % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
            amkw = KgtkWriter.open(mkr.column_names,
                                   activated_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)

        # Process each row of the input file.
        if verbose:
            print("Processing the input file.", file=error_file, flush=True)
        input_count: int = 0
        modified_edge_count: int = 0
        unmodified_edge_count: int = 0
        row: typing.List[str]
        for row in ikr:
            input_count +=1
            # Rewrite a copy so the original row can still be emitted as an
            # unmodified edge.
            newrow: typing.List[str] = row.copy()

            modified_node1: bool = False
            modified_node2: bool = False
            modified_label: bool = False

            if mapping_rule_mode in ["normal", "same-as-item"]:
                input_node1: str = row[input_node1_idx]
                if input_node1 in item_map:
                    newrow[input_node1_idx] = item_map[input_node1]
                    modified_node1 = True
                    # Activated rules are only tracked when they will be written.
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node1]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

                input_node2: str = row[input_node2_idx]
                if input_node2 in item_map:
                    newrow[input_node2_idx] = item_map[input_node2]
                    modified_node2 = True
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node2]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            if mapping_rule_mode in ["normal", "same-as-property"]:
                input_label: str = row[input_label_idx]
                if input_label in property_map:
                    newrow[input_label_idx] = property_map[input_label]
                    modified_label = True
                    if amkw is not None:
                        mapping_line_number = property_line_map[input_label]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            # Decide whether the edge counts as modified: "|" patterns need
            # any of the named columns replaced, "&" patterns need all of them.
            modified: bool
            if modified_pattern == "node1|label|node2":
                modified = modified_node1 or modified_label or modified_node2
            elif modified_pattern == "node1|label":
                modified = modified_node1 or modified_label
            elif modified_pattern == "node1|node2":
                modified = modified_node1 or modified_node2
            elif modified_pattern == "label|node2":
                modified = modified_label or modified_node2
            elif modified_pattern == "node1":
                modified = modified_node1
            elif modified_pattern == "label":
                modified = modified_label
            elif modified_pattern == "node2":
                modified = modified_node2
            elif modified_pattern == "node1&label&node2":
                modified = modified_node1 and modified_label and modified_node2
            elif modified_pattern == "node1&label":
                modified = modified_node1 and modified_label
            elif modified_pattern == "node1&node2":
                modified = modified_node1 and modified_node2
            elif modified_pattern == "label&node2":
                modified = modified_label and modified_node2
            else:
                raise KGTKException("Unrecognized modification test pattern %s" % repr(modified_pattern))

            if modified:
                modified_edge_count += 1
                okw.write(newrow)
            else:
                # An edge that fails the pattern is emitted in its original
                # form, even if some of its columns matched a rule.
                unmodified_edge_count += 1
                if uekw is not None:
                    uekw.write(row)
                if not split_output_mode:
                    okw.write(row)

        # Done!
        ikr.close()
        okw.close()

        if verbose:
            print("%d edges read. %d modified, %d unmodified." % (input_count, modified_edge_count, unmodified_edge_count), file=error_file, flush=True)

        if uekw is not None:
            uekw.close()

        if amkw is not None:
            # Emit the activated rules in mapping file order.
            activated_count: int = 0
            for mapping_line_number in sorted(activated_mapping_rows.keys()):
                amkw.write(activated_mapping_rows[mapping_line_number])
                activated_count += 1
            amkw.close()

            if verbose:
                print("%d activated mapping edges" % activated_count, file=error_file, flush=True)

        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as e:
        # Every failure, including the KGTKExceptions raised above, is
        # reported to the caller as a KGTKException carrying the message.
        raise KGTKException(str(e))
コード例 #11
0
def run(
        input_file: KGTKFiles,
        path_file: KGTKFiles,
        output_file: KGTKFiles,
        statistics_only: bool,
        undirected: bool,
        max_hops: int,
        source_column_name: typing.Optional[str],
        target_column_name: typing.Optional[str],
        shortest_path: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Find paths between (source, target) node pairs in a KGTK edge file.

    The path file supplies the node pairs: one pair per row, taken from its
    node1 (or ``source_column_name``) and node2 (or ``target_column_name``)
    columns.  The input file is loaded into a graph_tool graph, directed
    unless ``undirected`` is set.

    The output is an edge file with columns node1, label, node2, id:

    * unless ``statistics_only`` is set, every edge of the loaded graph is
      written first;
    * then, for each pair whose source and target each match exactly one
      vertex, every path found is written as one row per edge:
      ``p<path number>``, the edge's position in the path, the edge's id
      from the input file, and a generated row id.

    Paths are all shortest paths when ``shortest_path`` is set; otherwise
    all paths of at most ``max_hops`` edges.  Pairs whose endpoints do not
    match exactly one vertex are skipped without a message.

    ``errors_to_stderr`` and ``show_options`` are accepted but not
    consulted: diagnostics go to stdout when ``errors_to_stdout`` is true
    and to stderr otherwise.

    Raises:
        KGTKException: for missing columns, a path file row of the wrong
            length, or any other exception raised while processing (which
            is re-wrapped with an ``Error: `` prefix).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths

    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    from kgtk.exceptions import KGTKException
    try:

        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)

        # Name of the vertex property used below to look up node identifiers.
        # NOTE(review): presumably set by load_graph_from_kgtk -- confirm.
        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file),
                  file=error_file,
                  flush=True)
        pairs: typing.List[typing.Tuple[str, str]] = []
        pkr: KgtkReader = KgtkReader.open(
            path_kgtk_file,
            error_file=error_file,
            options=path_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        path_source_idx: int = pkr.get_node1_column_index(source_column_name)
        if path_source_idx < 0:
            print("Missing node1 (source) column name in the path file.",
                  file=error_file,
                  flush=True)

        path_target_idx: int = pkr.get_node2_column_index(target_column_name)
        if path_target_idx < 0:
            # The target comes from the node2 column.
            print("Missing node2 (target) column name in the path file.",
                  file=error_file,
                  flush=True)
        if path_source_idx < 0 or path_target_idx < 0:
            pkr.close()
            raise KGTKException("Exiting due to missing columns.")

        paths_read: int = 0
        path_row: typing.List[str]
        try:
            for path_row in pkr:
                paths_read += 1
                if len(path_row) != pkr.column_count:
                    raise KGTKException(
                        "Exiting because line %d in the path file (%s) is the wrong length: %d columns expected, %d were read."
                        % (paths_read, str(path_kgtk_file), pkr.column_count,
                           len(path_row)))
                src: str = path_row[path_source_idx]
                tgt: str = path_row[path_target_idx]
                pairs.append((src, tgt))
        finally:
            # Close the path file even when a malformed row aborts the read.
            pkr.close()
        if verbose:
            print("%d path rows read" % paths_read,
                  file=error_file,
                  flush=True)
        if len(pairs) == 0:
            print("No path pairs found, the output will be empty.",
                  file=error_file,
                  flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=input_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Report every missing input column before giving up.
        sub_index: int = kr.get_node1_column_index()
        if sub_index < 0:
            print("Missing node1 (subject) column.",
                  file=error_file,
                  flush=True)
        pred_index: int = kr.get_label_column_index()
        if pred_index < 0:
            print("Missing label (predicate) column.",
                  file=error_file,
                  flush=True)
        obj_index: int = kr.get_node2_column_index()
        if obj_index < 0:
            print("Missing node2 (object) column", file=error_file, flush=True)
        id_index: int = kr.get_id_column_index()
        if id_index < 0:
            print("Missing id column", file=error_file, flush=True)
        if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        # Actual header names of the label and id columns; they are used as
        # edge property names on the loaded graph.
        predicate: str = kr.column_names[pred_index]
        id_col_name: str = kr.column_names[id_index]

        G = load_graph_from_kgtk(kr,
                                 directed=not undirected,
                                 ecols=(sub_index, obj_index),
                                 verbose=verbose,
                                 out=error_file)

        output_columns: typing.List[str] = ['node1', 'label', 'node2', 'id']
        kw: KgtkWriter = KgtkWriter.open(output_columns,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        try:
            id_count = 0
            if not statistics_only:
                # Copy every graph edge to the output with a generated id.
                for e in G.edges():
                    sid, oid = e
                    lbl = G.ep[predicate][e]
                    kw.write([
                        G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                        '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)
                    ])
                    id_count += 1
                if verbose:
                    print("%d edges found." % id_count,
                          file=error_file,
                          flush=True)

            # The counters restart here: from now on id_count counts path
            # edges and path_id numbers the paths.
            id_count = 0
            path_id = 0
            for pair in pairs:
                source_node, target_node = pair
                source_ids = find_vertex(G,
                                         prop=G.properties[('v', id_col)],
                                         match=source_node)
                target_ids = find_vertex(G,
                                         prop=G.properties[('v', id_col)],
                                         match=target_node)
                # Only pairs whose endpoints each resolve to a single vertex
                # are searched.
                if len(source_ids) == 1 and len(target_ids) == 1:
                    source_id = source_ids[0]
                    target_id = target_ids[0]
                    if shortest_path:
                        _all_paths = all_shortest_paths(G,
                                                        source_id,
                                                        target_id,
                                                        edges=True)
                    else:
                        _all_paths = all_paths(G,
                                               source_id,
                                               target_id,
                                               cutoff=max_hops,
                                               edges=True)

                    for path in _all_paths:
                        for edge_num, an_edge in enumerate(path):
                            # Look the edge id up under the id column's
                            # actual header name, the same way the label
                            # property is looked up via `predicate`, rather
                            # than assuming the column is spelled 'id'.
                            edge_id = G.properties[('e', id_col_name)][an_edge]
                            node1: str = 'p%d' % path_id
                            kw.write([
                                node1,
                                str(edge_num), edge_id,
                                '{}-{}-{}'.format(node1, edge_num, id_count)
                            ])
                            id_count += 1
                        path_id += 1

            if verbose:
                print("%d paths contining %d edges found." % (path_id, id_count),
                      file=error_file,
                      flush=True)
        finally:
            # Release the output and input files whether or not the
            # traversal succeeded.
            kw.close()
            kr.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e))
コード例 #12
0
    def process(self):
        """
        Concatenate the input KGTK files into a single output file.

        All input files are opened first so that their column names can be
        merged and the edge/node consistency check can run before any data is
        written.  The rows of each input file are then copied to the output
        file in input order, shuffled into the merged column layout.

        Every reader opened here, and the writer if it was opened, is closed
        before this method returns or raises.

        Raises:
            ValueError: if standard input ("-") is listed more than once, if
                edge files and node files are mixed, if the number of output
                column names differs from the number of merged columns, or if
                a reader reports no file path.
        """
        kmc: KgtkMergeColumns = KgtkMergeColumns()

        # Is the output file an edge file, a node file, or unknown?
        is_edge_file: bool = False
        is_node_file: bool = False

        krs: typing.List[KgtkReader] = []
        ew: typing.Optional[KgtkWriter] = None
        kr: KgtkReader
        idx: int

        if self.verbose:
            print("Starting kgtkcat pid=%d" % (os.getpid()),
                  file=self.error_file,
                  flush=True)

        if self.verbose:
            print("Opening the %d input files." % len(self.input_file_paths),
                  file=self.error_file,
                  flush=True)

        # The try..finally block guarantees that every file opened so far is
        # closed, whether processing completes or an exception propagates.
        try:
            saw_stdin: bool = False
            input_file_path: Path
            for idx, input_file_path in enumerate(self.input_file_paths):
                if str(input_file_path) == "-":
                    if saw_stdin:
                        raise ValueError("Duplicate standard input file %d" %
                                         (idx + 1))
                    # Remember that standard input has been claimed so that a
                    # second "-" is rejected.
                    saw_stdin = True
                    if self.verbose:
                        print("Opening file %d: standard input" % (idx + 1),
                              file=self.error_file,
                              flush=True)
                else:
                    if self.verbose:
                        print("Opening file %d: %s" %
                              (idx + 1, str(input_file_path)),
                              file=self.error_file,
                              flush=True)

                kr = KgtkReader.open(
                    input_file_path,
                    who="input " + str(idx + 1),
                    options=self.reader_options,
                    value_options=self.value_options,
                    error_file=self.error_file,
                    verbose=self.verbose,
                    very_verbose=self.very_verbose,
                )
                krs.append(kr)

                # Unless directed otherwise, do not merge edge files with node
                # files.  If options.mode == KgtkReaderMode.NONE, then neither
                # kr.is_edge_file nor kr.is_node_file will be set and the
                # consistency check will be skipped.
                if kr.is_edge_file:
                    if is_node_file:
                        raise ValueError(
                            "Cannot merge an edge file to a node file: %s" %
                            input_file_path)
                    if not is_edge_file and self.verbose:
                        print("The output file will be an edge file.",
                              file=self.error_file,
                              flush=True)
                    is_edge_file = True
                elif kr.is_node_file:
                    if is_edge_file:
                        raise ValueError(
                            "Cannot merge a node file to an edge file: %s" %
                            input_file_path)
                    if not is_node_file and self.verbose:
                        print("The output file will be an node file.",
                              file=self.error_file,
                              flush=True)
                    is_node_file = True

                if self.verbose or self.very_verbose:
                    print("Mapping the %d column names in %s." %
                          (len(kr.column_names), input_file_path),
                          file=self.error_file,
                          flush=True)
                if self.very_verbose:
                    print(" ".join(kr.column_names),
                          file=self.error_file,
                          flush=True)
                new_column_names: typing.List[str] = kmc.merge(kr.column_names)
                if self.very_verbose:
                    print(" ".join(new_column_names),
                          file=self.error_file,
                          flush=True)

            if self.verbose or self.very_verbose:
                print("There are %d merged columns." % len(kmc.column_names),
                      file=self.error_file,
                      flush=True)
            if self.very_verbose:
                print(" ".join(kmc.column_names),
                      file=self.error_file,
                      flush=True)

            if self.output_column_names is not None:
                if self.verbose:
                    print("There are %d new output column names." %
                          len(self.output_column_names),
                          file=self.error_file,
                          flush=True)
                if len(self.output_column_names) != len(kmc.column_names):
                    raise ValueError(
                        "There are %d merged columns, but %d output column names."
                        % (len(kmc.column_names),
                           len(self.output_column_names)))

            output_mode: KgtkWriter.Mode = KgtkWriter.Mode.NONE
            if is_edge_file:
                output_mode = KgtkWriter.Mode.EDGE
                if self.verbose:
                    print("Opening the output edge file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)
            elif is_node_file:
                output_mode = KgtkWriter.Mode.NODE
                if self.verbose:
                    print("Opening the output node file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)
            else:
                if self.verbose:
                    print("Opening the output file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)

            ew = KgtkWriter.open(
                kmc.column_names,
                self.output_path,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                use_mgzip=self.reader_options.use_mgzip,  # Hack!
                mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
                gzip_in_parallel=False,
                mode=output_mode,
                output_format=self.output_format,
                output_column_names=self.output_column_names,
                old_column_names=self.old_column_names,
                new_column_names=self.new_column_names,
                verbose=self.verbose,
                very_verbose=self.very_verbose)

            output_data_lines: int = 0
            for idx, kr in enumerate(krs):
                if kr.file_path is None:
                    # This shouldn't happen because we constrained all
                    # input_file_path elements to be not None.  However,
                    # checking here keeps mypy happy.
                    #
                    # TODO: throw a better exception.
                    raise ValueError("Missing file path.")
                input_file_path = kr.file_path
                if self.verbose:
                    print("Copying data from file %d: %s" %
                          (idx + 1, input_file_path),
                          file=self.error_file,
                          flush=True)

                # Map this file's (merged) column names onto the output
                # column positions.
                shuffle_list: typing.List[int] = ew.build_shuffle_list(
                    kmc.new_column_name_lists[idx])

                input_data_lines: int = 0
                row: typing.List[str]
                for row in kr:
                    input_data_lines += 1
                    output_data_lines += 1
                    ew.write(row, shuffle_list=shuffle_list)

                # Flush the output file so far:
                ew.flush()

                if self.verbose:
                    print("Read %d data lines from file %d: %s" %
                          (input_data_lines, idx + 1, input_file_path),
                          file=self.error_file,
                          flush=True)

            if self.verbose:
                print("Wrote %d lines total from %d files" %
                      (output_data_lines, len(krs)),
                      file=self.error_file,
                      flush=True)
        finally:
            # Close the open files: the writer first, then every reader, even
            # if closing the writer fails.
            try:
                if ew is not None:
                    ew.close()
            finally:
                for kr2 in krs:
                    kr2.close()
コード例 #13
0
    def python_sort():
        """
        Sort the input KGTK file in memory using pure Python.

        The sort settings (columns, numeric_sort, reverse_sort, the input and
        output paths, the reader options, ...) are taken from the enclosing
        scope.  Rows are grouped by their sort key and written out in sorted
        key order; rows that share a key keep their input order.

        Raises:
            KGTKException: if numeric or reverse column lists were requested,
                if a sort column number or name is invalid, if the file mode
                is unknown and no sort columns were given, or if a numeric
                sort is requested on more than one key column.
        """
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException(
                'Error: the pure Python sorter does not currently support numeric column sorts.'
            )

        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException(
                'Error: the pure Python sorter does not currently support reverse column sorts.'
            )

        if verbose:
            print("Opening the input file: %s" % str(input_path),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_path,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sort_idx: int
        key_idxs: typing.List[int] = []
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            column_name: str
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        # Column numbers are 1-based.  Reject 0 as well as
                        # numbers past the last column: 0 would otherwise
                        # become index -1 and silently select the last column.
                        sort_idx = int(column_name_2)
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException(
                                "Invalid column number %d (max %d)." %
                                (sort_idx, len(kr.column_names)))
                        key_idxs.append(sort_idx - 1)
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" %
                                                column_name_2)
                        key_idxs.append(kr.column_name_map[column_name_2])
        else:
            # No explicit sort columns: fall back to defaults based on the
            # file mode.
            if kr.is_node_file:
                key_idxs.append(kr.id_column_idx)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    key_idxs.append(kr.id_column_idx)

                key_idxs.append(kr.node1_column_idx)
                key_idxs.append(kr.label_column_idx)
                key_idxs.append(kr.node2_column_idx)
            else:
                kr.close()
                cleanup()
                raise KGTKException(
                    "Unknown KGTK file mode, please specify the sorting columns."
                )

        if verbose:
            print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]),
                  file=error_file,
                  flush=True)

        if numeric_sort and len(key_idxs) > 1:
            # Do not leave the input file open on this error path.
            kr.close()
            raise KGTKException(
                'Error: the pure Python sorter does not currently support numeric sorts on multiple columns.'
            )

        # Map each sort key to the rows that carry it, in input order.
        lines: typing.MutableMapping[typing.Union[str, float],
                                     typing.List[typing.List[str]]] = dict()

        progress_startup()
        data_line_count: int = 0
        key: typing.Union[str, float]
        row: typing.List[str]
        for row in kr:
            data_line_count += 1
            key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx]
                                                      for idx in key_idxs)
            if numeric_sort:
                key = float(key)
            if key in lines:
                # There are multiple rows with the same key.  Make this a stable sort.
                lines[key].append(row)
            else:
                lines[key] = [row]
        if verbose:
            # Report the number of rows read, not the number of distinct keys.
            print("\nRead %d data lines." % data_line_count,
                  file=error_file,
                  flush=True)

        kw = KgtkWriter.open(kr.column_names,
                             output_path,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             verbose=verbose,
                             very_verbose=very_verbose)

        for key in sorted(lines.keys(), reverse=reverse_sort):
            for row in lines[key]:
                kw.write(row)

        kw.close()
        kr.close()
コード例 #14
0
    def process(self):
        """
        Join the left and right KGTK input files into one output file.

        Both inputs are opened and checked with ok_to_join(), the join key
        column lists are built, and the column names are merged.  The left
        rows are written first, followed by the right rows shuffled into the
        merged column layout.  When join_key_sets() returns a key set, only
        rows whose join key is in that set are written; when it returns None,
        every row is written.

        Returns:
            1 if the files cannot be joined or the join keys have different
            lengths; otherwise falls off the end (implicitly None).
        """
        if self.verbose:
            print("Opening the left edge file: %s" % str(self.left_file_path), file=self.error_file, flush=True)
        left_kr: KgtkReader = KgtkReader.open(
            self.left_file_path,
            who="left input",
            options=self.left_reader_options,
            value_options=self.value_options,
            error_file=self.error_file,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.verbose:
            print("Opening the right edge file: %s" % str(self.right_file_path), file=self.error_file, flush=True)
        right_kr: KgtkReader = KgtkReader.open(
            self.right_file_path,
            who="right input",
            options=self.right_reader_options,
            value_options=self.value_options,
            error_file=self.error_file,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if not self.ok_to_join(left_kr, right_kr):
            left_kr.close()
            right_kr.close()
            return 1

        left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, self.LEFT, self.left_join_columns)
        right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, self.RIGHT, self.right_join_columns)
        if len(left_join_idx_list) != len(right_join_idx_list):
            print("the left join key has %d components, the right join key has %d columns. Exiting." % (len(left_join_idx_list), len(right_join_idx_list)), file=self.error_file, flush=True)
            left_kr.close()
            right_kr.close()
            return 1

        # This might open the input files for a second time. This won't work with stdin.
        joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets(left_join_idx_list, right_join_idx_list)

        if self.verbose:
            print("Mapping the column names for the join.", file=self.error_file, flush=True)
        kmc: KgtkMergeColumns = KgtkMergeColumns()
        kmc.merge(left_kr.column_names, prefix=self.left_prefix)
        right_column_names: typing.List[str] = kmc.merge(right_kr.column_names, prefix=self.right_prefix)
        joined_column_names: typing.List[str] = kmc.column_names

        if self.verbose:
            print("       left   columns: %s" % " ".join(left_kr.column_names), file=self.error_file, flush=True)
            print("       right  columns: %s" % " ".join(right_kr.column_names), file=self.error_file, flush=True)
            print("mapped right  columns: %s" % " ".join(right_column_names), file=self.error_file, flush=True)
            print("       joined columns: %s" % " ".join(joined_column_names), file=self.error_file, flush=True)

        if self.verbose:
            print("Opening the output edge file: %s" % str(self.output_path), file=self.error_file, flush=True)
        ew: KgtkWriter = KgtkWriter.open(joined_column_names,
                                         self.output_path,
                                         mode=left_kr.mode,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

        output_data_lines: int = 0
        left_data_lines_read: int = 0
        left_data_lines_kept: int = 0
        right_data_lines_read: int = 0
        right_data_lines_kept: int = 0

        if self.verbose:
            print("Processing the left input file: %s" % str(self.left_file_path), file=self.error_file, flush=True)
        row: typing.List[str]
        for row in left_kr:
            left_data_lines_read += 1
            if joined_key_set is None:
                # No key filtering: keep every left row.
                ew.write(row)
                output_data_lines += 1
                left_data_lines_kept += 1
            else:
                left_key: str = self.build_join_key(left_kr, left_join_idx_list, row)
                if left_key in joined_key_set:
                    ew.write(row)
                    output_data_lines += 1
                    left_data_lines_kept += 1
        # Flush the output file so far:
        ew.flush()

        if self.verbose:
            print("Processing the right input file: %s" % str(self.right_file_path), file=self.error_file, flush=True)
        # Right rows are rearranged into the merged column layout on write.
        right_shuffle_list: typing.List[int] = ew.build_shuffle_list(right_column_names)
        for row in right_kr:
            right_data_lines_read += 1
            if joined_key_set is None:
                # No key filtering: keep every right row.
                ew.write(row, shuffle_list=right_shuffle_list)
                output_data_lines += 1
                right_data_lines_kept += 1
            else:
                right_key: str = self.build_join_key(right_kr, right_join_idx_list, row)
                if right_key in joined_key_set:
                    ew.write(row, shuffle_list=right_shuffle_list)
                    output_data_lines += 1
                    right_data_lines_kept += 1

        ew.close()
        # The input readers are no longer needed; close them as the early
        # error returns above already do.
        left_kr.close()
        right_kr.close()
        if self.verbose:
            print("The join is complete", file=self.error_file, flush=True)
            print("%d left input data lines read, %d kept" % (left_data_lines_read, left_data_lines_kept), file=self.error_file, flush=True)
            print("%d right input data lines read, %d kept" % (right_data_lines_read, right_data_lines_kept), file=self.error_file, flush=True)
            print("%d data lines written." % output_data_lines, file=self.error_file, flush=True)
コード例 #15
0
ファイル: kgtkidbuilder.py プロジェクト: rijulvohra/kgtk
def main():
    """
    Command-line driver for exercising the KGTK ID builder.

    Parses the command line, optionally echoes the resulting options, then
    opens a KgtkReader on the input file, creates a KgtkIdBuilder from it,
    opens a KgtkWriter using the builder's column names, and lets the builder
    process the data from the reader to the writer.
    """
    arg_parser: ArgumentParser = ArgumentParser()
    arg_parser.add_argument(
        dest="input_file_path",
        type=Path,
        nargs="?",
        default="-",
        help="The KGTK file with the input data (default=%(default)s)")
    arg_parser.add_argument(
        "-o",
        "--output-file",
        dest="output_file_path",
        type=Path,
        default="-",
        help="The KGTK file to write (default=%(default)s).")

    # Let each option class register its own command-line arguments.
    KgtkIdBuilderOptions.add_arguments(arg_parser)
    KgtkReader.add_debug_arguments(arg_parser)
    KgtkReaderOptions.add_arguments(arg_parser, mode_options=True)
    KgtkValueOptions.add_arguments(arg_parser)

    parsed: Namespace = arg_parser.parse_args()

    # Diagnostics go to stderr unless stdout was requested.
    error_file: typing.TextIO
    if parsed.errors_to_stdout:
        error_file = sys.stdout
    else:
        error_file = sys.stderr

    # Build the option structures from the parsed arguments.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(
        parsed)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_args(parsed)
    value_options: KgtkValueOptions = KgtkValueOptions.from_args(parsed)

    # Echo the final option structures for debugging and documentation.
    if parsed.show_options:
        input_text: str = str(parsed.input_file_path)
        output_text: str = str(parsed.output_file_path)
        print("input: %s" % input_text, file=error_file, flush=True)
        print("--output-file=%s" % output_text, file=error_file, flush=True)
        for option_set in (idbuilder_options, reader_options, value_options):
            option_set.show(out=error_file)

    # Construction order matters: the reader supplies parameters to the ID
    # builder, and the ID builder supplies the (possibly revised) column
    # names to the writer.
    reader: KgtkReader = KgtkReader.open(
        parsed.input_file_path,
        error_file=error_file,
        options=reader_options,
        value_options=value_options,
        verbose=parsed.verbose,
        very_verbose=parsed.very_verbose,
    )

    builder: KgtkIdBuilder = KgtkIdBuilder.new(reader, idbuilder_options)

    writer: KgtkWriter = KgtkWriter.open(
        builder.column_names,
        args_output_path(parsed) if False else parsed.output_file_path,
        mode=reader.mode,
        require_all_columns=True,
        prohibit_extra_columns=True,
        fill_missing_columns=False,
        gzip_in_parallel=False,
        verbose=parsed.verbose,
        very_verbose=parsed.very_verbose,
    )

    # Copy the input to the output, building IDs along the way.
    builder.process(reader, writer)

    writer.close()
    reader.close()
コード例 #16
0
ファイル: kgtkexplode.py プロジェクト: yyht/kgtk
    def process(self):
        """
        Explode one column of a KGTK file into per-field columns.

        The fields to extract are chosen from self.type_names (every field of
        each named data type) and then, if given, replaced by
        self.field_names.  Each selected field becomes a column named
        self.prefix + field name; an existing column of that name is reused
        only when self.overwrite_columns is set.  Rows whose value in the
        source column is invalid are copied through unexploded.  When
        self.expand_list is set, a list value produces one output row per
        list item.

        The input reader, and the output writer if it was opened, are closed
        before this method returns or raises.

        Raises:
            ValueError: if the source column name is empty or missing from
                the input file, if a data type or field name is unknown, if
                no fields are selected, if a field is duplicated, or if an
                exploded column already exists and may not be overwritten.
        """
        if len(self.column_name) == 0:
            raise ValueError("The name of the column to explode is empty.")

        selected_field_names: typing.List[str] = []
        field_name: str

        if self.type_names is not None:
            if self.verbose:
                print("Validate the names of the data types to extract.",
                      file=self.error_file,
                      flush=True)
            type_name: str
            for type_name in self.type_names:
                if type_name not in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS:
                    raise ValueError("Unknown data type name '%s'." %
                                     type_name)
                # Merge this KGTK data type's fields into the list of selected fields:
                for field_name in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS[
                        type_name]:
                    if field_name not in selected_field_names:
                        selected_field_names.append(field_name)

        if self.field_names is not None:
            # Forget the fields selected above, choose these instead:
            selected_field_names = []
            if self.verbose:
                print("Validate the names of the fields to extract.",
                      file=self.error_file,
                      flush=True)
            for field_name in self.field_names:
                if field_name not in KgtkValueFields.FIELD_NAMES:
                    raise ValueError("Unknown field name '%s'." % field_name)
                # Merge this field into the list of selected fields:
                if field_name not in selected_field_names:
                    selected_field_names.append(field_name)

        if len(selected_field_names) == 0:
            raise ValueError("The list of fields to explode is empty.")

        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        # From here on, make sure the reader (and the writer, once opened) is
        # closed even when an error is raised.
        ew: typing.Optional[KgtkWriter] = None
        try:
            if self.verbose:
                print("Check that the source column '%s' is present." %
                      self.column_name,
                      file=self.error_file,
                      flush=True)
            if self.column_name not in kr.column_name_map:
                raise ValueError(
                    "Column name '%s' not found in the input file." %
                    self.column_name)
            column_idx: int = kr.column_name_map[self.column_name]

            if self.verbose:
                print(
                    "Build the map of exploded columns and list of new column names",
                    file=self.error_file,
                    flush=True)
            # Maps each selected field name to the index of its output column.
            explosion: typing.MutableMapping[str, int] = {}
            column_names: typing.List[str] = kr.column_names.copy()
            for field_name in selected_field_names:
                exploded_name: str = self.prefix + field_name
                if self.verbose:
                    print("Field '%s' becomes '%s'" %
                          (field_name, exploded_name),
                          file=self.error_file,
                          flush=True)
                # The map is keyed by field name, so that is what must be
                # tested for duplicates.
                if field_name in explosion:
                    raise ValueError(
                        "Field name '%s' is duplicated in the field list." %
                        field_name)
                if exploded_name in kr.column_names:
                    if self.overwrite_columns:
                        existing_idx = kr.column_name_map[exploded_name]
                        explosion[field_name] = existing_idx
                        if self.verbose:
                            print(
                                "Field '%s' is overwriting existing column '%s' (idx=%d)"
                                % (field_name, exploded_name, existing_idx),
                                file=self.error_file,
                                flush=True)
                    else:
                        raise ValueError(
                            "Exploded column '%s' already exists and not allowed to overwrite"
                            % exploded_name)
                else:
                    column_names.append(exploded_name)
                    exploded_idx: int = len(column_names) - 1
                    explosion[field_name] = exploded_idx
                    if self.verbose:
                        print("Field '%s' becomes new column '%s' (idx=%d)" %
                              (field_name, exploded_name, exploded_idx),
                              file=self.error_file,
                              flush=True)
            new_column_count: int = len(column_names) - kr.column_count
            if self.verbose:
                # Diagnostics go to the error file so they cannot be mixed
                # into data written to standard output.
                print("%d columns + %d columns = %d columns" %
                      (kr.column_count, new_column_count, len(column_names)),
                      file=self.error_file,
                      flush=True)
                print("Explosion length: %d" % len(explosion),
                      file=self.error_file,
                      flush=True)

            # Open the output file.
            ew = KgtkWriter.open(column_names,
                                 self.output_file_path,
                                 mode=kr.mode,
                                 output_format=self.output_format,
                                 require_all_columns=False,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=True,
                                 gzip_in_parallel=False,
                                 verbose=self.verbose,
                                 very_verbose=self.very_verbose)

            if self.verbose:
                print("Expanding records from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            input_line_count: int = 0
            output_line_count: int = 0

            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                # Parse the value for the column being exploded:
                item_to_explode: str = row[column_idx]
                value: KgtkValue = KgtkValue(item_to_explode,
                                             options=self.value_options,
                                             parse_fields=True)
                value.validate()
                if not value.is_valid():
                    if self.verbose:
                        print(
                            "Not exploding invalid item '%s' in input line %d"
                            % (item_to_explode, input_line_count),
                            file=self.error_file,
                            flush=True)
                    ew.write(row)  # This will be filled to the proper length
                    output_line_count += 1
                    continue

                if self.expand_list and value.is_list():
                    if self.verbose:
                        print("Expanding a list: '%s'" % item_to_explode,
                              file=self.error_file,
                              flush=True)
                    subvalue: KgtkValue
                    for subvalue in value.get_list_items():
                        if self.very_verbose:
                            print("Exploding '%s'" % subvalue.value,
                                  file=self.error_file,
                                  flush=True)
                        ew.write(
                            self.explode(subvalue, row, explosion,
                                         new_column_count))
                        output_line_count += 1
                else:
                    if self.very_verbose:
                        print("Exploding '%s'" % value.value,
                              file=self.error_file,
                              flush=True)
                    ew.write(
                        self.explode(value, row, explosion, new_column_count))
                    output_line_count += 1

            if self.verbose:
                print("Read %d records, wrote %d records." %
                      (input_line_count, output_line_count),
                      file=self.error_file,
                      flush=True)
        finally:
            # Close the writer first, then the reader, even if closing the
            # writer fails.
            try:
                if ew is not None:
                    ew.close()
            finally:
                kr.close()
コード例 #17
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]],
        split_on_commas: bool,
        split_on_spaces: bool,
        strip_spaces: bool,
        all_except: bool,
        ignore_missing_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file while dropping (or keeping only) the named columns.

    Args:
        input_file: Input file specification, resolved through
            KGTKArgumentParser.get_input_file.
        output_file: Output file specification, resolved through
            KGTKArgumentParser.get_output_file.
        columns: Column-name arguments.  None is treated as an empty list.
        split_on_commas: Split each argument on "," into several names.
        split_on_spaces: Join all arguments with spaces and re-split them on
            whitespace before any other processing.
        strip_spaces: Strip leading/trailing whitespace from every name.
        all_except: When true, the named columns are the ones that are
            *kept* (in input-file order); otherwise they are removed.
        ignore_missing_columns: When true, names that are not in the input
            header are skipped instead of being reported as errors.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; this
            function does not consult it.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Forwarded to the reader and the writer.
        **kwargs: Forwarded to KgtkReaderOptions.from_dict and
            KgtkValueOptions.from_dict.

    Returns:
        0 on success; 1 after any exception has been handed to
        kgtk_exception_auto_handler.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics go to stderr unless the caller asked for stdout.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    def emit(message: str, flush: bool = False) -> None:
        """Write one diagnostic line to the selected error stream."""
        print(message, file=error_file, flush=flush)

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        emit("--input-file=%s" % str(input_kgtk_file))
        emit("--output-file=%s" % str(output_kgtk_file))
        if columns is not None:
            emit("--columns=%s" % " ".join(columns))
        emit("--split-on-commas=%s" % str(split_on_commas))
        emit("--split-on-spaces=%s" % str(split_on_spaces))
        emit("--strip-spaces=%s" % str(strip_spaces))
        emit("--all-except=%s" % str(all_except))
        emit("--ignore-missing-columns=%s" % str(ignore_missing_columns))
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        emit("=======", flush=True)

    try:
        # Treat "no --columns argument" as an empty list of arguments.
        requested: typing.List[str] = [] if columns is None else columns

        if split_on_spaces:
            # Leniently accept space-separated names *inside* shell quoting,
            # e.g.  kgtk remove_columns -c 'name name2 name3'
            # Do not enable this when column names may contain spaces.
            requested = " ".join(requested).split()

        # Expand every argument into zero or more cleaned-up column names.
        remove_columns: typing.List[str] = []
        for argument in requested:
            candidates = argument.split(",") if split_on_commas else [argument]
            for candidate in candidates:
                if strip_spaces:
                    candidate = candidate.strip()
                if candidate:
                    remove_columns.append(candidate)

        if verbose:
            summary: str = ("Removing all columns except %d columns: %s"
                            if all_except else "Removing %d columns: %s")
            emit(summary % (len(remove_columns), " ".join(remove_columns)),
                 flush=True)

        if not remove_columns:
            raise KGTKException("No columns to remove")

        if verbose:
            emit("Opening the input file: %s" % str(input_kgtk_file),
                 flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Names the caller asked for that the input header does not have.
        trouble_column_names: typing.List[str] = []

        def complain(template: str, name: str) -> None:
            """Report an unknown column and remember it for the final error."""
            emit(template % name, flush=True)
            trouble_column_names.append(name)

        output_column_names: typing.List[str]
        if all_except:
            if not ignore_missing_columns:
                for name in remove_columns:
                    if name not in kr.column_names:
                        complain("Error: cannot retain unknown column '%s'.",
                                 name)

            # Keep the requested columns, preserving the input file's order.
            output_column_names = [
                name for name in kr.column_names if name in remove_columns
            ]
        else:
            output_column_names = kr.column_names.copy()
            for name in remove_columns:
                if name in output_column_names:
                    output_column_names.remove(name)
                elif not ignore_missing_columns:
                    complain("Error: cannot remove unknown column '%s'.", name)

        if trouble_column_names:
            raise KGTKException("Unknown columns %s" %
                                " ".join(trouble_column_names))

        if verbose:
            emit("Opening the output file: %s" % str(output_kgtk_file),
                 flush=True)
        kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        # The shuffle list lets the writer pick the surviving fields out of
        # each full-width input row.
        shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

        input_line_count: int = 0
        row: typing.List[str]
        for input_line_count, row in enumerate(kr, start=1):
            kw.write(row, shuffle_list=shuffle_list)

        if verbose:
            emit("Processed %d rows." % (input_line_count), flush=True)

        kw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
コード例 #18
0
def run(input_files: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        header_only: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs # Whatever KgtkReaderOptions and KgtkValueOptions want.
)->int:
    """Validate one or more KGTK files by reading them with KgtkReader.

    Each file is opened with KgtkReader.  Unless `header_only` is set, every
    data row is then read and counted; the checks themselves are presumably
    performed by KgtkReader according to the reader/value options.

    Args:
        input_files: Input file specification, resolved through
            KGTKArgumentParser.get_input_file_list.
        errors_to_stdout: Accepted for interface compatibility; this
            function does not consult it.
        errors_to_stderr: Send messages to stderr.  When false (the
            default) messages go to stdout.
        header_only: Open each file (which reads its header) and close it
            again without reading the data rows.
        show_options: Print the effective options before validating.
        verbose: Print progress messages.
        very_verbose: Forwarded to KgtkReader.
        **kwargs: Forwarded to KgtkReaderOptions.from_dict and
            KgtkValueOptions.from_dict.

    Returns:
        0 when every file has been read without an exception.

    Raises:
        KGTKException: wrapping any Exception or SystemExit raised while
            reading; the original exception is attached as the cause.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    kgtk_files: typing.List[Path] = KGTKArgumentParser.get_input_file_list(input_files)

    # Select where to send messages.  For this command the default is
    # stdout; stderr is used only when explicitly requested.
    error_file: typing.TextIO = sys.stderr if errors_to_stderr else sys.stdout

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-files: %s" % " ".join((str(kgtk_file) for kgtk_file in kgtk_files)), file=error_file)
        print("--header-only=%s" % str(header_only), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        kgtk_file: Path
        for kgtk_file in kgtk_files:
            if verbose:
                # Keep the separator on the same stream as the messages it
                # separates, so redirecting one stream does not split them.
                print("\n====================================================", file=error_file, flush=True)
                if str(kgtk_file) != "-":
                    print("Validating '%s'" % str(kgtk_file), file=error_file, flush=True)
                else:
                    print ("Validating from stdin", file=error_file, flush=True)

            kr: KgtkReader = KgtkReader.open(kgtk_file,
                                             error_file=error_file,
                                             options=reader_options,
                                             value_options=value_options,
                                             verbose=verbose,
                                             very_verbose=very_verbose)

            if header_only:
                # Opening the reader has already processed the header.
                kr.close()
                if verbose:
                    print("Validated the header only.", file=error_file, flush=True)
            else:
                # Reading every row drives the reader over the whole file;
                # the rows themselves are not needed here.
                line_count: int = 0
                row: typing.List[str]
                for row in kr:
                    line_count += 1
                if verbose:
                    print("Validated %d data lines" % line_count, file=error_file, flush=True)
        return 0

    except SystemExit as e:
        # SystemExit is not an Exception subclass, so it needs its own clause.
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
コード例 #19
0
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    columns: typing.Optional[typing.List[str]] = None,
    locale: str = "C",
    reverse_sort: bool = False,
    reverse_columns: typing.Optional[typing.List[str]] = None,
    numeric_sort: bool = False,
    numeric_columns: typing.Optional[typing.List[str]] = None,
    pure_python: bool = False,
    extra: typing.Optional[str] = None,
    bash_command: str = "bash",
    bzip2_command: str = "bzip2",
    gzip_command: str = "gzip",
    pgrep_command: str = "pgrep",
    sort_command: str = "sort",
    xz_command: str = "xz",
    errors_to_stdout: bool = False,
    errors_to_stderr: bool = True,
    show_options: bool = False,
    verbose: bool = False,
    very_verbose: bool = False,
    **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Sort a KGTK file, keeping its header line first.

    By default the data is sorted by an external `sort` command run from a
    bash script.  The script reads the header line, passes a copy to Python
    through a pipe, echoes it to the output, and then waits for the sort
    options.  Python parses the header with KgtkReader, turns the requested
    columns into `-k` options, and sends them back through a second pipe.
    With `pure_python` the file is instead read into memory and sorted in
    Python.

    Args:
        input_file: Input file specification ("-" means stdin).
        output_file: Output file specification ("-" means stdout).
        columns: Sort keys as column names or 1-based column numbers; each
            entry may be a comma-separated list.  When omitted, a node file
            is sorted on its id column and an edge file on id (if present),
            node1, label, node2.
        locale: Value for LC_ALL when running the sort command; an empty
            string leaves it unset.
        reverse_sort: Reverse the whole sort.
        reverse_columns: Key columns that get the per-key "r" modifier
            (external sort only).
        numeric_sort: Sort numerically.  The Python sorter supports this
            for a single key column only.
        numeric_columns: Key columns that get the per-key "n" modifier
            (external sort only).
        pure_python: Use the in-memory Python sorter.
        extra: Extra option text appended to the sort command's options.
        bash_command: Shell used to run the pipeline script.
        bzip2_command: Program used for .bz2/.bz input and output.
        gzip_command: Program used for .gz/.z input and output.
        pgrep_command: Program used to find the sort process for progress
            monitoring.
        sort_command: The external sort program.
        xz_command: Program used for .xz/.lzma input and output.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; this
            function does not consult it.
        show_options: Accepted for interface compatibility; this function
            does not consult it.
        verbose: Print progress messages.
        very_verbose: Forwarded to the reader and the writer.
        **kwargs: Forwarded to KgtkReaderOptions.from_dict and
            KgtkValueOptions.from_dict.

    Returns:
        0 on success.

    Raises:
        KGTKException: for invalid sort columns or unsupported option
            combinations.  On the external-sort path every exception is
            re-raised as a KGTKException whose message starts with
            "INTERNAL ERROR: ".
    """
    from io import StringIO
    import os
    from pathlib import Path
    import sh  # type: ignore
    import shlex
    import sys
    import typing

    from kgtk.cli_entry import progress_startup
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_path: Path = KGTKArgumentParser.get_input_file(input_file)
    output_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    def python_sort():
        """Sort the input in memory and write it with KgtkWriter."""
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException(
                'Error: the pure Python sorter does not currently support numeric column sorts.'
            )

        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException(
                'Error: the pure Python sorter does not currently support reverse column sorts.'
            )

        if verbose:
            print("Opening the input file: %s" % str(input_path),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_path,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sort_idx: int
        key_idxs: typing.List[int] = []
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            column_name: str
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        sort_idx = int(column_name_2)
                        # Column numbers are 1-based.  Rejecting 0 matters:
                        # it would become index -1, i.e. the last column.
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException(
                                "Invalid column number %d (max %d)." %
                                (sort_idx, len(kr.column_names)))
                        key_idxs.append(sort_idx - 1)
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" %
                                                column_name_2)
                        key_idxs.append(kr.column_name_map[column_name_2])
        else:
            if kr.is_node_file:
                key_idxs.append(kr.id_column_idx)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    key_idxs.append(kr.id_column_idx)

                key_idxs.append(kr.node1_column_idx)
                key_idxs.append(kr.label_column_idx)
                key_idxs.append(kr.node2_column_idx)
            else:
                cleanup()
                raise KGTKException(
                    "Unknown KGTK file mode, please specify the sorting columns."
                )

        if verbose:
            print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]),
                  file=error_file,
                  flush=True)

        if numeric_sort and len(key_idxs) > 1:
            raise KGTKException(
                'Error: the pure Python sorter does not currently support numeric sorts on multiple columns.'
            )

        # Rows grouped by sort key.  Rows sharing a key stay in input order.
        lines: typing.MutableMapping[typing.Union[str, float],
                                     typing.List[typing.List[str]]] = dict()

        progress_startup()
        row_count: int = 0
        key: typing.Union[str, float]
        row: typing.List[str]
        for row in kr:
            row_count += 1
            key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx]
                                                      for idx in key_idxs)
            if numeric_sort:
                key = float(key)
            if key in lines:
                # There are multiple rows with the same key.  Make this a stable sort.
                lines[key].append(row)
            else:
                lines[key] = [row]
        if verbose:
            # Report rows read, not len(lines), which counts distinct keys.
            print("\nRead %d data lines." % row_count,
                  file=error_file,
                  flush=True)

        kw = KgtkWriter.open(kr.column_names,
                             output_path,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             verbose=verbose,
                             very_verbose=very_verbose)

        for key in sorted(lines.keys(), reverse=reverse_sort):
            for row in lines[key]:
                kw.write(row)

        kw.close()
        kr.close()

    if pure_python:
        # python_sort() has no return value; report success explicitly so
        # this function honours its declared int return type.
        python_sort()
        return 0

    try:
        # The pipe descriptors and process handles are module globals,
        # presumably so the module-level cleanup() helper (not shown here)
        # can reach them.
        global header_read_fd
        global header_write_fd
        header_read_fd, header_write_fd = os.pipe()
        os.set_inheritable(header_write_fd, True)
        if verbose:
            print("header pipe: read_fd=%d write_fd=%d" %
                  (header_read_fd, header_write_fd),
                  file=error_file,
                  flush=True)

        global sortopt_read_fd
        global sortopt_write_fd
        sortopt_read_fd, sortopt_write_fd = os.pipe()
        os.set_inheritable(sortopt_read_fd, True)
        if verbose:
            print("sort options pipe: read_fd=%d write_fd=%d" %
                  (sortopt_read_fd, sortopt_write_fd),
                  file=error_file,
                  flush=True)

        locale_envar: str = "LC_ALL=%s" % locale if len(locale) > 0 else ""

        # Note: "read -u n", used below, is not supported by some shells.
        # bash and zsh support it.
        # ash, csh, dash, and tcsh do not.
        # The original standard Bourne shell, sh, does not.
        # ksh might do it, if the FD number is a single digit.
        cmd: str = "".join((
            "{ IFS= read -r header ; ",  # Read the header line
            " { printf \"%s\\n\" \"$header\" >&" + str(header_write_fd) +
            " ; } ; ",  # Send the header to Python
            " printf \"%s\\n\" \"$header\" ; ",  # Send the header to standard output (which may be redirected to a file, below).
            " IFS= read -u " + str(sortopt_read_fd) +
            " -r options ; ",  # Read the sort command options from Python.
            " %s %s -t '\t' $options ; } " % (
                locale_envar, sort_command
            ),  # Sort the remaining input lines using the options read from Python.
        ))
        if str(output_path) != "-":
            # Do we want to compress the output?
            output_suffix: str = output_path.suffix.lower()
            if output_suffix in [".gz", ".z"]:
                if verbose:
                    print("gzip output file: %s" % repr(str(output_path)),
                          file=error_file,
                          flush=True)
                cmd += " | " + gzip_command + " -"

            elif output_suffix in [".bz2", ".bz"]:
                if verbose:
                    print("bzip2 output file: %s" % repr(str(output_path)),
                          file=error_file,
                          flush=True)
                cmd += " | " + bzip2_command + " -z"

            elif output_suffix in [".xz", ".lzma"]:
                if verbose:
                    print("xz output file: %s" % repr(str(output_path)),
                          file=error_file,
                          flush=True)
                cmd += " | " + xz_command + " -z -"

            # Feed the sorted output to the named file.  Otherwise, the sorted
            # output goes to standard output without passing through Python.
            # shlex.quote is used because repr() is not a shell quoter: it
            # mangles backslashes and leaves "$" live in some paths.
            cmd += " > " + shlex.quote(str(output_path))

        if verbose:
            print("sort command: %s" % cmd, file=error_file, flush=True)

        global cat_proc
        cat_proc = None
        global cmd_proc
        cmd_proc = None

        def cat_done(cmd, success, exit_code):
            # When the cat command finishes, monitor the progress of the sort command.
            if verbose:
                print("\nDone reading the input file",
                      file=error_file,
                      flush=True)
            if cmd_proc is None:
                return

            # Locate the sort command using pgrep
            buf = StringIO()
            try:
                sh_pgrep = sh.Command(pgrep_command)
                sh_pgrep("-g",
                         cmd_proc.pgid,
                         "--newest",
                         sort_command,
                         _out=buf)
                pgrep_output = buf.getvalue()
                if len(pgrep_output) == 0:
                    if verbose:
                        print("Unable to locate the sort command.",
                              file=error_file,
                              flush=True)
                    return
                sort_pid = int(pgrep_output)
            except Exception as e:
                # Progress monitoring is best effort; never fail the sort
                # because the sort process could not be located.
                if verbose:
                    print("Exception looking for sort command: %s" % str(e),
                          file=error_file,
                          flush=True)
                return
            finally:
                buf.close()

            if verbose:
                print("Monitoring the sort command (pid=%d)" % sort_pid,
                      file=error_file,
                      flush=True)
            progress_startup(pid=sort_pid)

        if str(input_path) == "-":
            # Read from standard input.
            #
            # Sh version 1.13 or greater is required for _pass_fds.
            sh_bash = sh.Command(bash_command)
            cmd_proc = sh_bash("-c",
                               cmd,
                               _in=sys.stdin,
                               _out=sys.stdout,
                               _err=sys.stderr,
                               _bg=True,
                               _bg_exc=False,
                               _internal_bufsize=1,
                               _pass_fds={header_write_fd, sortopt_read_fd})

            # It would be nice to monitor the sort command here.  Unfortunately, there
            # is a race condition that makes this difficult.  We could loop until the
            # sort command is created, then monitor it.

        else:
            # Feed the named file into the data processing pipeline,
            input_suffix: str = input_path.suffix.lower()
            if input_suffix in [".gz", ".z"]:
                if verbose:
                    print("gunzip input file: %s" % repr(str(input_path)),
                          file=error_file,
                          flush=True)
                sh_gzip = sh.Command(gzip_command)
                cat_proc = sh_gzip(input_path,
                                   "-dc",
                                   _in=sys.stdin,
                                   _piped=True,
                                   _err=sys.stderr,
                                   _bg=True,
                                   _bg_exc=False,
                                   _internal_bufsize=1,
                                   _done=cat_done)

                if verbose:
                    print("full command: %s -dc %s | %s" %
                          (gzip_command, repr(str(input_path)), cmd),
                          file=error_file,
                          flush=True)

            elif input_suffix in [".bz2", ".bz"]:
                if verbose:
                    print("bunzip2 input file: %s" % repr(str(input_path)),
                          file=error_file,
                          flush=True)
                sh_bzip2 = sh.Command(bzip2_command)
                cat_proc = sh_bzip2(input_path,
                                    "-dc",
                                    _in=sys.stdin,
                                    _piped=True,
                                    _err=sys.stderr,
                                    _bg=True,
                                    _bg_exc=False,
                                    _internal_bufsize=1,
                                    _done=cat_done)

                if verbose:
                    print("full command: %s -dc %s | %s" %
                          (bzip2_command, repr(str(input_path)), cmd),
                          file=error_file,
                          flush=True)

            elif input_suffix in [".xz", ".lzma"]:
                if verbose:
                    print("unxz input file: %s" % repr(str(input_path)),
                          file=error_file,
                          flush=True)
                sh_xz = sh.Command(xz_command)
                cat_proc = sh_xz(input_path,
                                 "-dc",
                                 _in=sys.stdin,
                                 _piped=True,
                                 _err=sys.stderr,
                                 _bg=True,
                                 _bg_exc=False,
                                 _internal_bufsize=1,
                                 _done=cat_done)
                if verbose:
                    print("full command: %s -dc %s | %s" %
                          (xz_command, repr(str(input_path)), cmd),
                          file=error_file,
                          flush=True)

            else:
                if verbose:
                    print("input file: %s" % repr(str(input_path)),
                          file=error_file,
                          flush=True)
                cat_proc = sh.cat(input_path,
                                  _in=sys.stdin,
                                  _piped=True,
                                  _err=sys.stderr,
                                  _bg=True,
                                  _bg_exc=False,
                                  _internal_bufsize=1,
                                  _done=cat_done)
                if verbose:
                    print("full command: cat %s | %s" %
                          (repr(str(input_path)), cmd),
                          file=error_file,
                          flush=True)

            # If enabled, monitor the progress of reading the input file.
            # Since we do not have access to the pid of the sort command,
            # we cannot monitor the progress of the merge phases.
            if verbose:
                print("Monitoring the cat command (pid=%d)." % cat_proc.pid,
                      file=error_file,
                      flush=True)
            progress_startup(pid=cat_proc.pid)

            # Sh version 1.13 or greater is required for _pass_fds.
            sh_bash = sh.Command(bash_command)
            cmd_proc = sh_bash(cat_proc,
                               "-c",
                               cmd,
                               _out=sys.stdout,
                               _err=sys.stderr,
                               _bg=True,
                               _bg_exc=False,
                               _internal_bufsize=1,
                               _pass_fds={header_write_fd, sortopt_read_fd})
            # Since we do not have access to the pid of the sort command,
            # we cannot monitor the progress of the merge phases.

        if verbose:
            print("Running the sort script (pid=%d)." % cmd_proc.pid,
                  file=error_file,
                  flush=True)

        if verbose:
            print("Reading the KGTK input file header line with KgtkReader",
                  file=error_file,
                  flush=True)
        # The "<fd" path form is presumably KgtkReader's syntax for reading
        # from an already-open file descriptor; here it is the header pipe.
        kr: KgtkReader = KgtkReader.open(
            Path("<%d" % header_read_fd),
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        if verbose:
            print("KGTK header: %s" % " ".join(kr.column_names),
                  file=error_file,
                  flush=True)

        sort_options: str = ""
        if reverse_sort:
            sort_options += " --reverse"
        if numeric_sort:
            sort_options += " --numeric"

        if extra is not None and len(extra) > 0:
            sort_options += " " + extra

        # We will consume entries in reverse_columns and numeric_columns,
        # then complain if any are left over.
        if reverse_columns is not None:
            reverse_columns = reverse_columns[:]  # Protect against modifying a shared list.
        if numeric_columns is not None:
            numeric_columns = numeric_columns[:]  # Protect against modifying a shared list.

        def key_option(key_idx: int, key_name: str) -> str:
            """Build one " -k N,N[r][n]" option for a 1-based key column.

            `key_name` is matched against reverse_columns/numeric_columns;
            a matching entry is consumed so leftovers can be reported.
            """
            option: str = " -k %d,%d" % (key_idx, key_idx)
            if reverse_columns is not None and key_name in reverse_columns:
                option += "r"
                reverse_columns.remove(key_name)
            if numeric_columns is not None and key_name in numeric_columns:
                option += "n"
                numeric_columns.remove(key_name)
            return option

        column_name: str
        sort_idx: int
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        sort_idx = int(column_name_2)
                        # Column numbers are 1-based; 0 is not a valid key.
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException(
                                "Invalid column number %d (max %d)." %
                                (sort_idx, len(kr.column_names)))
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" %
                                                repr(column_name_2))
                        sort_idx = kr.column_name_map[column_name_2] + 1
                    # Modifiers are matched against the token exactly as the
                    # caller wrote it (a name or a column number).
                    sort_options += key_option(sort_idx, column_name_2)
        else:
            # TODO: support the case where the column name in reverse_columns
            # or numeric_columns is an alias of the name used in the file header.
            default_idxs: typing.List[int] = []
            if kr.is_node_file:
                default_idxs.append(kr.id_column_idx)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    default_idxs.append(kr.id_column_idx)
                default_idxs.append(kr.node1_column_idx)
                default_idxs.append(kr.label_column_idx)
                default_idxs.append(kr.node2_column_idx)

            else:
                cleanup()
                raise KGTKException(
                    "Unknown KGTK file mode, please specify the sorting columns."
                )

            default_idx: int
            for default_idx in default_idxs:
                # Reader indexes are 0-based; sort's -k fields are 1-based.
                sort_options += key_option(default_idx + 1,
                                           kr.column_names[default_idx])

        # Check for unconsumed entries in reverse_columns and numeric_columns:
        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException("Unknown reverse column(s) %s" % " ".join(
                [repr(column_name) for column_name in reverse_columns]))
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException("Unknown numeric column(s) %s" % " ".join(
                [repr(column_name) for column_name in numeric_columns]))

        if verbose:
            print("sort options: %s" % sort_options,
                  file=error_file,
                  flush=True)

        kr.close()  # We are done with the KgtkReader now.

        # Send the sort options back to the data processing pipeline.
        # The bash script is blocked on this pipe until the line arrives.
        with open(sortopt_write_fd, "w") as options_file:
            options_file.write(sort_options + "\n")

        if verbose:
            print("\nWaiting for the sort command to complete.\n",
                  file=error_file,
                  flush=True)
        cmd_proc.wait()

        if verbose:
            print("Cleanup.", file=error_file, flush=True)
        cleanup()

        return 0

    except Exception as e:
        # import traceback
        # traceback.print_tb(sys.exc_info()[2], 10)
        # Chain the original exception so its traceback is not lost.
        raise KGTKException('INTERNAL ERROR: ' + type(e).__module__ + '.' +
                            str(e) + '\n') from e
コード例 #20
0
ファイル: graph_statistics.py プロジェクト: rijulvohra/kgtk
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        compute_degrees: bool,
        compute_pagerank: bool,
        compute_hits: bool,
        log_file: str,
        statistics_only: bool,
        vertex_in_degree: str,
        vertex_out_degree: str,
        vertex_pagerank: str,
        vertex_auth: str,
        vertex_hubs: str,
        top_n: int,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Load a KGTK edge file as a graph and report statistics about it.

    A text summary is written to ``log_file``.  Per-vertex statistics (and,
    unless ``statistics_only`` is set, the input edges themselves) are
    written as KGTK edges with the columns node1, label, node2 and id.

    Args:
        input_file: Input specification, resolved with
            ``KGTKArgumentParser.get_input_file``.
        output_file: Output specification, resolved with
            ``KGTKArgumentParser.get_output_file``.
        undirected: When true the graph is loaded as undirected.
        compute_degrees: Add mean/std/max degree lines to the log for the
            'in', 'out' and 'total' directions.
        compute_pagerank: Compute PageRank, log the top entries and emit the
            value for every vertex.
        compute_hits: Compute HITS hubs/authorities, log the top entries and
            emit the values for every vertex.
        log_file: Path of the summary file; it is opened for writing.
        statistics_only: When true the input edges are not copied to the
            output file.
        vertex_in_degree: Label of the emitted in-degree edges.
        vertex_out_degree: Label of the emitted out-degree edges.
        vertex_pagerank: Label of the emitted PageRank edges.
        vertex_auth: Label of the emitted HITS authority edges.
        vertex_hubs: Label of the emitted HITS hub edges.
        top_n: Number of entries requested from ``get_topn_indices``.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Accepted for interface compatibility; not read here.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Source of the reader and value option structures.

    Raises:
        KGTKException: On any failure, including missing node1/label/node2
            columns.  The original exception is chained as the cause.
    """
    # import modules locally
    from contextlib import ExitStack
    from pathlib import Path
    import sys
    import typing

    from graph_tool import centrality
    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Maps the names of the computed vertex properties to the labels that
    # are used for them in the output edges.
    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:
        # The ExitStack closes the writer and then the reader on every exit
        # path, so a failure part-way through no longer leaks open files.
        # Errors raised while closing are still wrapped by the handler below.
        with ExitStack() as open_files:

            # Select where to send error messages, defaulting to stderr.
            error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

            # Build the option structures.
            reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
                kwargs)
            value_options: KgtkValueOptions = KgtkValueOptions.from_dict(
                kwargs)

            input_kgtk_file: Path = KGTKArgumentParser.get_input_file(
                input_file)
            output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
                output_file)

            # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
            directions = ['in', 'out', 'total']
            id_col = 'name'
            output_columns = ["node1", "label", "node2", "id"]

            if verbose:
                print('loading the KGTK input file...\n',
                      file=error_file,
                      flush=True)
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            open_files.callback(kr.close)

            # Report every missing required column before giving up.
            sub: int = kr.get_node1_column_index()
            if sub < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred: int = kr.get_label_column_index()
            if pred < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj: int = kr.get_node2_column_index()
            if obj < 0:
                print("Missing node2 (object) column",
                      file=error_file,
                      flush=True)
            if sub < 0 or pred < 0 or obj < 0:
                raise KGTKException("Exiting due to missing columns.")

            # The name of the label column doubles as the name of the edge
            # property that holds each edge's label.
            predicate: str = kr.column_names[pred]

            G2 = load_graph_from_kgtk(kr,
                                      directed=not undirected,
                                      ecols=(sub, obj),
                                      verbose=verbose,
                                      out=error_file)
            if verbose:
                print('graph loaded! It has %d nodes and %d edges.' %
                      (G2.num_vertices(), G2.num_edges()),
                      file=error_file,
                      flush=True)

            kw: KgtkWriter = KgtkWriter.open(output_columns,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            open_files.callback(kw.close)

            # Write the human-readable summary.
            with open(log_file, 'w') as writer:
                writer.write('graph loaded! It has %d nodes and %d edges\n' %
                             (G2.num_vertices(), G2.num_edges()))
                writer.write('\n###Top relations:\n')
                for rel, freq in gtanalysis.get_topN_relations(
                        G2, pred_property=predicate):
                    writer.write('%s\t%d\n' % (rel, freq))

                if compute_degrees:
                    writer.write('\n###Degrees:\n')
                    for direction in directions:
                        degree_data = gtanalysis.compute_node_degree_hist(
                            G2, direction)
                        max_degree = len(degree_data) - 1
                        mean_degree, std_degree = gtanalysis.compute_avg_node_degree(
                            G2, direction)
                        writer.write(
                            '%s degree stats: mean=%f, std=%f, max=%d\n' %
                            (direction, mean_degree, std_degree, max_degree))

                if compute_pagerank:
                    writer.write('\n###PageRank\n')
                    v_pr = G2.new_vertex_property('float')
                    centrality.pagerank(G2, prop=v_pr)
                    # Registering the property on the graph is what makes it
                    # show up in the per-vertex output loop below.
                    G2.properties[('v', 'vertex_pagerank')] = v_pr
                    writer.write('Max pageranks\n')
                    result = gtanalysis.get_topn_indices(
                        G2, 'vertex_pagerank', top_n, id_col)
                    for n_id, n_label, pr in result:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

                if compute_hits:
                    writer.write('\n###HITS\n')
                    hits_eig, G2.vp['vertex_hubs'], G2.vp[
                        'vertex_auth'] = gtanalysis.compute_hits(G2)
                    writer.write('HITS hubs\n')
                    main_hubs = gtanalysis.get_topn_indices(
                        G2, 'vertex_hubs', top_n, id_col)
                    for n_id, n_label, hubness in main_hubs:
                        writer.write('%s\t%s\t%f\n' %
                                     (n_id, n_label, hubness))
                    writer.write('HITS auth\n')
                    main_auth = gtanalysis.get_topn_indices(
                        G2, 'vertex_auth', top_n, id_col)
                    for n_id, n_label, authority in main_auth:
                        writer.write('%s\t%s\t%f\n' %
                                     (n_id, n_label, authority))

            # Copy the input edges, giving each one a generated id.
            id_count = 0
            if not statistics_only:
                for e in G2.edges():
                    sid, oid = e
                    lbl = G2.ep[predicate][e]
                    kw.write([
                        G2.vp[id_col][sid], lbl, G2.vp[id_col][oid],
                        '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count)
                    ])
                    id_count += 1

            # Emit the per-vertex statistics.  The id counter restarts here.
            id_count = 0
            for v in G2.vertices():
                v_id = G2.vp[id_col][v]
                kw.write([
                    v_id, vertex_in_degree,
                    str(v.in_degree()),
                    '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)
                ])
                id_count += 1
                kw.write([
                    v_id, vertex_out_degree,
                    str(v.out_degree()),
                    '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)
                ])
                id_count += 1

                for vprop in G2.vertex_properties.keys():
                    # The id property is the vertex name, not a statistic.
                    if vprop == id_col:
                        continue
                    kw.write([
                        v_id, v_prop_dict[vprop],
                        str(G2.vp[vprop][v]),
                        '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)
                    ])
                    id_count += 1

    except Exception as e:
        # Chain the cause so the original traceback stays available.
        raise KGTKException('Error: ' + str(e)) from e
コード例 #21
0
ファイル: unique.py プロジェクト: usbader/kgtk
    def process(self):
        """Count the distinct values of one input column and write the counts.

        Reads every row of the input file, optionally keeping only rows whose
        ``where_column_name`` value is one of ``where_values``, and counts the
        occurrences of each value of ``column_name``.  An empty value is
        replaced by ``empty_value`` (and dropped if that is empty too);
        ``prefix`` is prepended to every counted value.

        With ``output_format == "edge"`` one (value, ``label_value``, count)
        row is written per value.  With ``output_format == "node"`` a single
        row is written, with one column per value.

        Raises:
            ValueError: If a requested column is missing from the input, a
                where column is given without values, the output format is
                unknown, or a counted value would clash with a node1 column
                name in node format.
        """
        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        input_line_count: int = 0
        skip_line_count: int = 0
        empty_value_count: int = 0
        value_counts: typing.MutableMapping[str, int] = {}

        # Close the reader whether the scan succeeds or a check fails.
        try:
            if self.column_name not in kr.column_name_map:
                raise ValueError("Column %s is not in the input file" %
                                 (self.column_name))
            column_idx: int = kr.column_name_map[self.column_name]

            where_column_idx: int = -1
            where_value_set: typing.Set[str] = set()
            if self.where_column_name is not None:
                if self.where_column_name not in kr.column_name_map:
                    raise ValueError(
                        "Where column '%s' is not in the input file." %
                        (self.where_column_name))
                where_column_idx = kr.column_name_map[self.where_column_name]
                if self.where_values is None or len(self.where_values) == 0:
                    raise ValueError(
                        "Where column '%s' but no values to test." %
                        (self.where_column_name))
                where_value_set = set(self.where_values)

            if self.verbose:
                print("Counting unique values from the %s column in %s" %
                      (self.column_name, self.input_file_path),
                      file=self.error_file,
                      flush=True)

            row: typing.List[str]
            for row in kr:
                input_line_count += 1
                if where_column_idx >= 0:
                    if row[where_column_idx] not in where_value_set:
                        skip_line_count += 1
                        continue
                value: str = row[column_idx]
                if len(value) == 0:
                    # Count the empty cells explicitly so the summary below
                    # reports a real figure.
                    empty_value_count += 1
                    value = self.empty_value
                if len(value) > 0:
                    value = self.prefix + value
                    value_counts[value] = value_counts.get(value, 0) + 1
        finally:
            kr.close()

        if self.verbose:
            print(
                "Read %d records, skipped %d, found %d unique non-empty values, %d empty values."
                % (input_line_count, skip_line_count, len(value_counts),
                   empty_value_count),
                file=self.error_file,
                flush=True)

        sorted_values: typing.List[str] = sorted(value_counts)

        # In node mode we can't open the output file until we are done reading
        # the input file, because we need the list of unique values to
        # build the column list.
        output_columns: typing.List[str]
        if self.output_format == "edge":
            output_columns = ["node1", "label", "node2"]
        elif self.output_format == "node":
            output_columns = ["id"]
            for value in sorted_values:
                # TODO: provide a way to override this check.
                if value in KgtkFormat.NODE1_COLUMN_NAMES:
                    raise ValueError(
                        "Cannot write a KGTK node file with a column named '%s'."
                        % value)
                output_columns.append(value)
        else:
            raise ValueError("Unknown output format %s" %
                             str(self.output_format))

        if self.verbose:
            print("Opening the output file: %s" % self.output_file_path,
                  file=self.error_file,
                  flush=True)

        ew: KgtkWriter = KgtkWriter.open(output_columns,
                                         self.output_file_path,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)
        try:
            if self.output_format == "edge":
                for value in sorted_values:
                    ew.write(
                        [value, self.label_value,
                         str(value_counts[value])])
            else:
                # The format was validated above, so this is the node format:
                # one row holding the count for every value column.
                output_row: typing.List[str] = [self.column_name]
                for value in sorted_values:
                    output_row.append(str(value_counts[value]))
                ew.write(output_row)
        finally:
            ew.close()
コード例 #22
0
    def process(self):
        """Read the input rows, key them, and hand them to ``process_row``.

        The key columns come from ``key_column_names`` or, when none are
        given, from the required columns of the edge or node file.  With
        ``deduplicate`` set, every column that is neither a key nor a
        keep-first column is added to the key.  Rows are passed to
        ``process_row`` either in input order (``sorted_input``, optionally
        verifying the order) or after grouping them in memory and visiting the
        keys in sorted order.  A final flushing call to ``process_row``
        follows the last row.

        Raises:
            ValueError: For missing ID builder options, unknown or conflicting
                column names, an input that is neither an edge nor a node file
                when no key columns are given, or a sort order violation.
        """
        # Announce and open the input file.
        if self.verbose:
            if self.input_file_path is None:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)
            else:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        self.id_column_idx = kr.id_column_idx

        # The output columns are the reader's, unless an ID builder was
        # requested, in which case the builder supplies them.
        idb: typing.Optional[KgtkIdBuilder] = None
        output_column_names: typing.List[str]
        if not self.build_id:
            output_column_names = kr.column_names
        elif self.idbuilder_options is None:
            raise ValueError(
                "ID build requested but ID builder options are missing")
        else:
            idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
            output_column_names = idb.column_names

        # Collect the indices of the key columns.
        key_idx_list: typing.List[int] = []
        if len(self.key_column_names) > 0:
            # Explicit key columns: duplicates are dropped silently, unknown
            # names are an error.
            #
            # TODO: warn about duplicates?
            for key_name in self.key_column_names:
                if key_name not in kr.column_name_map:
                    raise ValueError("Column %s is not in the input file" %
                                     (repr(key_name)))
                named_idx: int = kr.column_name_map[key_name]
                if named_idx not in key_idx_list:
                    key_idx_list.append(named_idx)
        elif kr.is_edge_file:
            # Default to the required columns of a KGTK edge file.
            key_idx_list.extend([
                kr.node1_column_idx,
                kr.label_column_idx,
                kr.node2_column_idx,
            ])
            if not self.compact_id and kr.id_column_idx >= 0:
                key_idx_list.append(kr.id_column_idx)
        elif kr.is_node_file:
            # Default to the required column of a KGTK node file.
            key_idx_list.append(kr.id_column_idx)
        else:
            raise ValueError(
                "The input file is neither an edge nor a node file.  Key columns must be supplied."
            )

        if self.verbose:
            print("key indexes: %s" % " ".join(map(str, key_idx_list)),
                  file=self.error_file,
                  flush=True)

        # Resolve the keep-first columns; they may not overlap the key.
        self.keep_first_idx_list.clear()
        if len(self.keep_first_names) > 0:
            for first_name in self.keep_first_names:
                if first_name not in kr.column_name_map:
                    raise ValueError(
                        "Keep first column %s is not in the input file" %
                        (repr(first_name)))
                first_idx: int = kr.column_name_map[first_name]
                if first_idx in key_idx_list:
                    raise ValueError(
                        "Keep first column %s may not be a key column" %
                        (repr(first_name)))
                self.keep_first_idx_list.append(first_idx)
            if self.verbose:
                print("keep first indexes: %s" %
                      " ".join(map(str, self.keep_first_idx_list)),
                      file=self.error_file,
                      flush=True)

        if self.deduplicate:
            if self.compact_id and kr.id_column_idx >= 0 and kr.id_column_idx not in self.keep_first_idx_list:
                self.keep_first_idx_list.append(kr.id_column_idx)

            # Every remaining column that is neither keep-first nor already a
            # key becomes part of the key.
            for column_idx in range(kr.column_count):
                if column_idx in self.keep_first_idx_list or column_idx in key_idx_list:
                    continue
                key_idx_list.append(column_idx)

            if self.verbose:
                print("revised key indexes: %s" %
                      " ".join(map(str, key_idx_list)),
                      file=self.error_file,
                      flush=True)

        if self.verbose:
            print("key indexes: %s" %
                  " ".join(str(key_idx) for key_idx in key_idx_list),
                  file=self.error_file,
                  flush=True)

        # Both writers are opened with the same settings.
        writer_settings = dict(
            mode=kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            use_mgzip=self.reader_options.use_mgzip,  # Hack!
            mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                         self.output_file_path,
                                         **writer_settings)

        # Open the optional list output file.
        lew: typing.Optional[KgtkWriter] = None
        if self.list_output_file_path is not None:
            lew = KgtkWriter.open(output_column_names,
                                  self.list_output_file_path,
                                  **writer_settings)

        input_line_count: int = 0
        # Stays empty when the input has no rows; the final flush below
        # passes whatever row was seen last.
        last_row: typing.List[str] = []
        input_key: str
        prev_input_key: typing.Optional[str] = None
        going_up: typing.Optional[bool] = None
        if self.sorted_input:
            if self.verbose:
                print("Reading the input data from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            for last_row in kr:
                input_line_count += 1
                input_key = self.build_key(last_row, key_idx_list)
                if self.verify_sort:
                    if prev_input_key is None:
                        prev_input_key = input_key
                    elif prev_input_key != input_key:
                        # The first change of key fixes the direction; every
                        # later change must go the same way.
                        ascending: bool = prev_input_key < input_key
                        if going_up is None:
                            going_up = ascending
                        elif going_up != ascending:
                            if going_up:
                                violation = "Line %d sort violation going up: prev='%s' curr='%s'"
                            else:
                                violation = "Line %d sort violation going down: prev='%s' curr='%s'"
                            raise ValueError(
                                violation %
                                (input_line_count,
                                 prev_input_key.replace(
                                     self.field_separator,
                                     KgtkFormat.LIST_SEPARATOR),
                                 input_key.replace(
                                     self.field_separator,
                                     KgtkFormat.LIST_SEPARATOR)))
                        prev_input_key = input_key

                self.process_row(input_key, last_row, input_line_count, idb,
                                 ew, lew)

        else:
            if self.verbose:
                print("Sorting the input data from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            # Group the rows by key, keeping input order within each key.
            input_map: typing.MutableMapping[
                str, typing.List[typing.List[str]]] = {}

            for last_row in kr:
                input_line_count += 1
                input_key = self.build_key(last_row, key_idx_list)
                input_map.setdefault(input_key, []).append(last_row)

            if self.verbose:
                print("Processing the sorted input data",
                      file=self.error_file,
                      flush=True)

            for input_key in sorted(input_map):
                for last_row in input_map[input_key]:
                    self.process_row(input_key, last_row, input_line_count,
                                     idb, ew, lew)

        # Flush the final row, if any.  We pass the last row read for
        # feedback, such as an ID uniqueness violation.
        self.process_row("",
                         last_row,
                         input_line_count,
                         idb,
                         ew,
                         lew,
                         flush=True)

        if self.verbose:
            print("Read %d records, excluded %d records, wrote %d records." %
                  (input_line_count, self.excluded_row_count,
                   self.output_line_count),
                  file=self.error_file,
                  flush=True)
            if lew is not None:
                print("Wrote %d list ouput records." %
                      (self.list_output_line_count),
                      file=self.error_file,
                      flush=True)

        ew.close()
        if lew is not None:
            lew.close()
コード例 #23
0
    def get_initial_namespaces(self) -> int:
        """Load the initial namespace prefixes from the namespace file.

        When no namespace file is configured the result of
        ``get_default_namespaces()`` is returned instead.  Otherwise every row
        whose label equals ``prefix_expansion_label`` maps a namespace id
        (node1) to a prefix (node2, a double-quoted string).  Accepted pairs
        are stored in ``namespace_prefixes`` and ``namespace_ids``.  Rows that
        are malformed, duplicated or carry another label are skipped, with a
        message when verbose.

        Returns:
            The number of rows read from the namespace file, including the
            skipped ones.
        """
        # Read the namespaces.  If no file, use a limited internal
        # default.
        if self.namespace_file_path is None:
            return self.get_default_namespaces()

        if self.verbose:
            # Report the file that is actually being read.
            print("Processing namespace file file %s" %
                  str(self.namespace_file_path),
                  file=self.error_file,
                  flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.namespace_file_path,
            mode=KgtkReaderMode.EDGE,
            options=self.reader_options,
            error_file=self.error_file,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        namespace_line_count: int = 0
        # Close the reader even if a row cannot be processed.
        try:
            namespace_row: typing.List[str]
            for namespace_row in kr:
                namespace_line_count += 1
                if namespace_row[
                        kr.label_column_idx] == self.prefix_expansion_label:
                    namespace_id: str = namespace_row[kr.node1_column_idx]
                    namespace_prefix: str = namespace_row[kr.node2_column_idx]
                    if not (namespace_prefix.startswith('"')
                            and namespace_prefix.endswith('"')):
                        if self.verbose:
                            print(
                                "The namespace prefix must be a KGKT string: '%s'"
                                % namespace_prefix,
                                file=self.error_file,
                                flush=True)
                        continue

                    # Strip the delimiting double quotes from the KGTK string.
                    # Per RFC 3986, internal double quotes are not allowed in
                    # a URL unless percent-encoded, so we needn't bother
                    # looking for them.
                    namespace_prefix = namespace_prefix[1:-1]

                    # The first definition wins in both directions; the two
                    # maps are checked independently of each other.
                    if namespace_prefix in self.namespace_prefixes:
                        if self.verbose:
                            print("Duplicate initial namespace prefix '%s'" %
                                  namespace_prefix,
                                  file=self.error_file,
                                  flush=True)
                    else:
                        self.namespace_prefixes[
                            namespace_prefix] = namespace_id
                    if namespace_id in self.namespace_ids:
                        if self.verbose:
                            print("Duplicate initial namespace id '%s'" %
                                  namespace_id,
                                  file=self.error_file,
                                  flush=True)
                    else:
                        self.namespace_ids[namespace_id] = namespace_prefix
                else:
                    if self.verbose:
                        print("Ignoring initial namespace label '%s'" %
                              namespace_row[kr.label_column_idx],
                              file=self.error_file,
                              flush=True)
        finally:
            kr.close()

        return namespace_line_count
コード例 #24
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]] = None,
        labels: typing.Optional[typing.List[str]] = None,
        id_column_name: typing.Optional[str] = None,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Turn the columns of a KGTK node file into KGTK edges.

    For every input row, each non-empty value of each selected column yields
    one edge per list item: node1 is the row's ID value, label is the column
    name (or its replacement from ``labels``) and node2 is the item.

    Args:
        input_file: Input specification, resolved with
            ``KGTKArgumentParser.get_input_file``.
        output_file: Output specification, resolved with
            ``KGTKArgumentParser.get_output_file``.
        columns: Names of the columns to convert; all non-ID columns when
            omitted.
        labels: Replacement labels, matched to ``columns`` by position.
        id_column_name: Name passed to the reader to locate the ID column.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Source of the reader and value option structures.

    Returns:
        0 on success, 1 after ``kgtk_exception_auto_handler`` has handled a
        failure (if the handler returns).

    Raises:
        KGTKException: If ``labels`` is given without ``columns`` or the two
            lists differ in length.
    """
    # import modules locally
    import os

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Messages go to stderr unless stdout was requested.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)

        for option_format, option_values in (("--columns=%s", columns),
                                             ("--labels=%s", labels)):
            if option_values is not None:
                print(option_format % " ".join(option_values),
                      file=error_file)
        if id_column_name is not None:
            print("--id-column=%s" % id_column_name, file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Starting normalize_nodes pid=%d" % (os.getpid()),
              file=error_file,
              flush=True)

    # Pair each selected column with its replacement label.
    label_map: typing.MutableMapping[str, str] = dict()
    if labels is not None and len(labels) > 0:
        if columns is None:
            raise KGTKException(
                "--columns must be supplied when --labels is used.")
        if len(columns) != len(labels):
            raise KGTKException("%d columns were supplied, but %d labels." %
                                (len(columns), len(labels)))
        label_map.update(zip(columns, labels))

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        id_column_idx: int = kr.get_id_column_index(id_column_name)
        if id_column_idx < 0:
            raise KGTKException("Unknown ID column %s" % repr(id_column_name))

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            [KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2],
            output_kgtk_file,
            mode=KgtkWriter.Mode.EDGE,
            verbose=verbose,
            very_verbose=very_verbose)

        rows_read: int = 0
        edges_written: int = 0
        node_row: typing.List[str]
        for node_row in kr:
            rows_read += 1
            subject: str = node_row[id_column_idx]

            position: int
            name: str
            for position, name in enumerate(kr.column_names):
                # Skip the ID column itself and any column not selected.
                if position == id_column_idx or (columns is not None
                                                 and name not in columns):
                    continue

                cell: str = node_row[position]
                if len(cell) == 0:
                    continue  # ignore empty values.

                edge_label: str = label_map.get(name, name)

                # The cell may hold a KGTK list.  node2 isn't supposed to
                # contain lists, so each item becomes its own edge.
                item: str
                for item in KgtkValue.split_list(cell):
                    if len(item) > 0:  # node2 shouldn't contain empty values
                        kw.write([subject, edge_label, item])
                        edges_written += 1

        if verbose:
            print("Read %d node rows, wrote %d edge rows." %
                  (rows_read, edges_written),
                  file=error_file,
                  flush=True)

        kw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
コード例 #25
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file to the output, letting a KgtkIdBuilder process the rows.

    Args:
        input_file: Input file selection, resolved with
            KGTKArgumentParser.get_input_file().
        output_file: Output file selection, resolved with
            KGTKArgumentParser.get_output_file().
        errors_to_stdout: When true, diagnostics go to stdout instead of
            stderr.
        errors_to_stderr: Accepted for interface compatibility; this function
            does not read it.
        show_options: When true, print the resolved file names and option
            structures to the diagnostic stream before processing.
        verbose: Forwarded to the reader and the writer.
        very_verbose: Forwarded to the reader and the writer.
        **kwargs: Dictionary from which the ID builder, reader, and value
            option structures are built.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: "Exit requested" when SystemExit is raised during
            processing; otherwise the str() of any other Exception raised.
    """
    # The imports are deliberately local to this function.
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics default to stderr; stdout is used only on request.
    error_file: typing.TextIO = sys.stderr
    if errors_to_stdout:
        error_file = sys.stdout

    # Each option structure picks what it needs out of the shared kwargs.
    id_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    rd_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    val_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Optional dump of the effective settings, for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(source_path), file=error_file)
        print("--output-file=%s" % str(target_path), file=error_file)
        id_options.show(out=error_file)
        rd_options.show(out=error_file)
        val_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        # The order matters: the ID builder is created from the open reader,
        # and the writer takes its column names from the ID builder.
        reader: KgtkReader = KgtkReader.open(
            source_path,
            error_file=error_file,
            options=rd_options,
            value_options=val_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        id_builder: KgtkIdBuilder = KgtkIdBuilder.new(reader, id_options)

        # The writer uses the same file mode as the reader.
        writer: KgtkWriter = KgtkWriter.open(
            id_builder.column_names,
            target_path,
            mode=KgtkWriter.Mode[reader.mode.name],
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Stream the rows from the reader to the writer through the builder.
        id_builder.process(reader, writer)

        writer.close()
        reader.close()

        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))
コード例 #26
0
ファイル: validate-properties.py プロジェクト: nicklein/kgtk
def run(
        input_file: KGTKFiles,
        pattern_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        grouped_input: bool = False,
        reject_node1_groups: bool = False,
        no_complaints: bool = False,
        complain_immediately: bool = False,
        add_isa_column: bool = False,
        isa_column_name: str = "isa;node2",
        autovalidate: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Validate a KGTK file against property patterns loaded from a second file.

    The pattern file is opened in edge mode and loaded with
    PropertyPatterns.load().  The input file is then handed to a
    PropertyPatternValidator together with the optional output and reject
    writers.

    Args:
        input_file: Input file selection.
        pattern_file: Pattern file selection (stdin is not used as a default).
        output_file: Optional output file selection for good rows.
        reject_file: Optional output file selection for rejected rows.
        grouped_input: Forwarded to the validator; shown as --presorted.
        reject_node1_groups: Forwarded to the validator.
        no_complaints: Forwarded to the validator.
        complain_immediately: Forwarded to the validator.
        add_isa_column: When true and an output file is requested, locate
            isa_column_name among the output columns or append it.
        isa_column_name: Name of the ISA column.
        autovalidate: Forwarded to the validator.
        errors_to_stdout: When true, diagnostics go to stdout instead of
            stderr.
        errors_to_stderr: Accepted for interface compatibility; this function
            does not read it.
        show_options: When true, print the effective options to the
            diagnostic stream.
        verbose: Enables progress messages and is forwarded to the helpers.
        very_verbose: Forwarded to the helpers.
        **kwargs: Dictionary from which the reader and value option
            structures are built.

    Returns:
        0 when processing completes.

    Raises:
        KGTKException: Wrapping any Exception raised while processing.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderMode, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.propertypatternvalidator import PropertyPatterns, PropertyPatternValidator
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    pattern_kgtk_file: Path = KGTKArgumentParser.get_input_file(
        pattern_file, default_stdin=False)
    output_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(output_file)
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(reject_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    # Every line goes to error_file so that the option dump stays in one
    # stream and never mixes with data written to stdout.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--pattern-file=%s" % str(pattern_kgtk_file), file=error_file)
        if output_kgtk_file is not None:
            print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        print("--presorted=%s" % str(grouped_input), file=error_file)
        print("--reject-node1-groups=%s" % str(reject_node1_groups),
              file=error_file)
        print("--complain-immediately=%s" % str(complain_immediately),
              file=error_file)
        print("--add-isa-column=%s" % str(add_isa_column), file=error_file)
        print("--isa-column-name=%s" % str(isa_column_name), file=error_file)
        print("--autovalidate=%s" % str(autovalidate), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Reading data from '%s'" % str(input_kgtk_file),
              file=error_file,
              flush=True)
        print("Reading patterns from '%s'" % str(pattern_kgtk_file),
              file=error_file,
              flush=True)
        if output_kgtk_file is not None:
            print("Writing good data to '%s'" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        if reject_kgtk_file is not None:
            print("Writing rejected data to '%s'" % str(reject_kgtk_file),
                  file=error_file,
                  flush=True)

    try:
        # The pattern file is always read as an edge file.
        pkr: KgtkReader = KgtkReader.open(pattern_kgtk_file,
                                          error_file=error_file,
                                          mode=KgtkReaderMode.EDGE,
                                          options=reader_options,
                                          value_options=value_options,
                                          verbose=verbose,
                                          very_verbose=very_verbose)

        pps: PropertyPatterns = PropertyPatterns.load(
            pkr,
            value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                         error_file=error_file,
                                         options=reader_options,
                                         value_options=value_options,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        # The output columns are the input columns, plus the ISA column when
        # it is requested and not already present.
        # NOTE(review): the column list stays empty when only a reject file is
        # requested, yet the reject writer below is opened with it -- confirm
        # whether that combination is intended.
        output_column_names: typing.List[str] = []
        isa_column_idx: int = -1
        if output_kgtk_file is not None:
            output_column_names = kr.column_names.copy()
            if add_isa_column:
                if isa_column_name in output_column_names:
                    isa_column_idx = output_column_names.index(isa_column_name)
                else:
                    isa_column_idx = len(output_column_names)
                    output_column_names.append(isa_column_name)

        ppv: PropertyPatternValidator = PropertyPatternValidator.new(
            pps,
            kr,
            grouped_input=grouped_input,
            reject_node1_groups=reject_node1_groups,
            no_complaints=no_complaints,
            complain_immediately=complain_immediately,
            isa_column_idx=isa_column_idx,
            autovalidate=autovalidate,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose)

        kw: typing.Optional[KgtkWriter] = None
        if output_kgtk_file is not None:
            kw = KgtkWriter.open(output_column_names,
                                 output_kgtk_file,
                                 verbose=verbose,
                                 very_verbose=very_verbose)

        rkw: typing.Optional[KgtkWriter] = None
        if reject_kgtk_file is not None:
            rkw = KgtkWriter.open(output_column_names,
                                  reject_kgtk_file,
                                  verbose=verbose,
                                  very_verbose=very_verbose)

        ppv.process(kr, kw, rkw)

        if verbose:
            print("Read %d rows, %d valid" %
                  (ppv.input_row_count, ppv.valid_row_count),
                  file=error_file,
                  flush=True)
            if kw is not None:
                print("Wrote %d good rows" % ppv.output_row_count,
                      file=error_file,
                      flush=True)
            if rkw is not None:
                print("Wrote %d rejected rows" % ppv.reject_row_count,
                      file=error_file,
                      flush=True)

        if kw is not None:
            kw.close()
        if rkw is not None:
            rkw.close()

        return 0

    except Exception as e:
        # Chain explicitly so the original traceback is reported as the cause.
        raise KGTKException(e) from e
コード例 #27
0
ファイル: split.py プロジェクト: vishalbelsare/kgtk
def run(input_file: KGTKFiles,
        output_path: str,
        file_prefix: str,
        split_by_qnode: bool,
        lines: int,
        gzipped_output: bool,
        errors_to_stdout: bool = False,
        **kwargs) -> int:
    """Split a KGTK edge file into several files, keeping node1 groups together.

    Only rows whose node1 value starts with 'Q' or 'P' are written; other
    rows are skipped.  Rows are buffered, and the buffer is flushed to a new
    file only when node1 changes (as decided by are_nodes_equal) and either
    split_by_qnode is set or at least `lines` rows are buffered.

    Args:
        input_file: Input file selection.
        output_path: Folder in which the output files are created.
        file_prefix: Prefix of the numbered output files (used when
            split_by_qnode is false).
        split_by_qnode: When true, write one file per node1 group, named
            after the group's node.
        lines: Minimum number of buffered rows before a numbered file is
            written.
        gzipped_output: When true the output files end in ".tsv.gz",
            otherwise in ".tsv".
        errors_to_stdout: When true, diagnostics go to stdout instead of
            stderr.
        **kwargs: Dictionary from which the reader and value option
            structures are built.

    Returns:
        0 on success, including when the input contains no rows to write.

    Raises:
        KGTKException: When the input lacks a node1, label, or node2 column.
    """
    import sys
    from pathlib import Path
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    def write_files(error_file, file_number, file_prefix, kr, lines_to_write,
                    output_path, Qnode, reader_options, split_by_qnode,
                    suffix):
        # Write one batch of buffered rows to its own output file.
        if split_by_qnode:
            output_kgtk_file = Path(f'{output_path}/{Qnode}{suffix}')
        else:
            output_kgtk_file = Path(
                f'{output_path}/{file_prefix}{file_number}{suffix}')
        kw = KgtkWriter.open(
            kr.column_names,
            output_kgtk_file,
            mode=KgtkWriter.Mode[kr.mode.name],
            use_mgzip=reader_options.use_mgzip,  # Hack!
            mgzip_threads=reader_options.mgzip_threads,  # Hack!
            error_file=error_file,
            verbose=False,
            very_verbose=False)
        try:
            for r in lines_to_write:
                kw.write(r)
        finally:
            # Close the writer even when a write fails.
            kw.close()

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)

    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr
    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    suffix = ".tsv.gz" if gzipped_output else ".tsv"

    kr: KgtkReader = KgtkReader.open(
        input_kgtk_file,
        options=reader_options,
        value_options=value_options,
        error_file=error_file,
        verbose=False,
        very_verbose=False,
    )

    node1_idx: int = kr.get_node1_column_index()
    label_idx: int = kr.get_label_column_index()
    node2_idx: int = kr.get_node2_column_index()

    if node1_idx < 0 or label_idx < 0 or node2_idx < 0:
        print(
            "Error: Not a valid file: {}. A valid edge file should have these columns: node1, label and node2"
            .format(input_file),
            file=error_file,
            flush=True)
        kr.close()
        raise KGTKException("Missing columns.")

    prev = None
    lines_to_write = list()
    file_number = 0

    try:
        for row in kr:
            node = row[node1_idx]
            if not node.startswith(('Q', 'P')):
                continue

            if prev is None:
                prev = node

            if not are_nodes_equal(prev, node):
                # A batch is only cut at a node1 boundary, so that the rows
                # of one node are never spread over two files.
                if split_by_qnode or len(lines_to_write) >= lines:
                    write_files(error_file, file_number, file_prefix, kr,
                                lines_to_write, output_path, prev,
                                reader_options, split_by_qnode, suffix)

                    lines_to_write = list()
                    file_number += 1

                prev = node

            lines_to_write.append(row)

        # Flush whatever is left in the buffer.
        if len(lines_to_write) > 0:
            write_files(error_file, file_number, file_prefix, kr,
                        lines_to_write, output_path, prev, reader_options,
                        split_by_qnode, suffix)
    finally:
        # The reader was previously left open on the normal path.
        kr.close()

    # Always report success; previously None was returned when no rows were
    # buffered at the end of the input.
    return 0
コード例 #28
0
    def python_sort():
        """Sort the input file in memory and write the sorted rows to the output.

        This function takes no arguments; it reads input_path, output_path,
        columns, reader_options, value_options, error_file, verbose,
        very_verbose, cleanup and progress_startup from the enclosing scope.

        The sort key of a row is its key-column values joined with
        KgtkFormat.KEY_FIELD_SEPARATOR.  Key columns come from `columns`
        (names or 1-based column numbers, optionally comma-separated); when
        none are given, a node file is sorted on its id column and an edge
        file on id (if present), node1, label, and node2.

        Rows that share a sort key are all kept and written in input order.

        Raises:
            KGTKException: For a column number outside 1..len(columns), an
                unknown column name, or an unknown file mode with no columns
                specified.
        """
        if verbose:
            print("Opening the input file: %s" % str(input_path),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_path,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sort_idx: int
        key_idxs: typing.List[int] = []
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            column_name: str
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        # Column numbers are 1-based.  Zero must be rejected,
                        # otherwise it would become index -1 and silently
                        # select the last column.
                        sort_idx = int(column_name_2)
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException(
                                "Invalid column number %d (max %d)." %
                                (sort_idx, len(kr.column_names)))
                        key_idxs.append(sort_idx - 1)
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" %
                                                column_name_2)
                        key_idxs.append(kr.column_name_map[column_name_2])
        else:
            if kr.is_node_file:
                key_idxs.append(kr.id_column_idx)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    key_idxs.append(kr.id_column_idx)

                key_idxs.append(kr.node1_column_idx)
                key_idxs.append(kr.label_column_idx)
                key_idxs.append(kr.node2_column_idx)
            else:
                # Close the reader here too, like the other error paths.
                kr.close()
                cleanup()
                raise KGTKException(
                    "Unknown KGTK file mode, please specify the sorting columns."
                )

        if verbose:
            print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]),
                  file=error_file,
                  flush=True)

        # Group the rows by sort key.  Each key maps to a list of rows so that
        # rows with equal keys are preserved (in input order) instead of
        # overwriting one another.
        lines: typing.MutableMapping[str, typing.List[typing.List[str]]] = dict()
        data_line_count: int = 0

        progress_startup()
        key: str
        row: typing.List[str]
        for row in kr:
            key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx]
                                                      for idx in key_idxs)
            lines.setdefault(key, []).append(row)
            data_line_count += 1
        if verbose:
            print("\nRead %d data lines." % data_line_count,
                  file=error_file,
                  flush=True)

        kw = KgtkWriter.open(kr.column_names,
                             output_path,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             verbose=verbose,
                             very_verbose=very_verbose)
        for key in sorted(lines.keys()):
            for row in lines[key]:
                kw.write(row)
        kw.close()
        kr.close()
コード例 #29
0
ファイル: kgtkifexists.py プロジェクト: yyht/kgtk
    def process(self):
        """Open the input, filter, output and reject files, then run the filter.

        The input and filter readers are opened first and their key columns
        are resolved with self.get_key_columns().  If the two key lists
        differ in length, a message is printed and the method returns without
        processing.  Otherwise the optional output and reject writers are
        opened with the input file's columns and mode, and the work is
        dispatched to one of the process_cacheing_* methods depending on
        self.cache_input and self.preserve_order.
        """
        UPDATE_VERSION: str = "2020-08-24T21:47:20.256050+00:00#mr0wtMHlN/QaplDsGc/ylG3Hw5stsjziykzuGlSHBSion4xoW/Bec0sn55IQ7wFWBUClRS7g1tbAuaqEduhUVA=="
        if self.show_version or self.verbose:
            print("KgtkIfExists version: %s" % UPDATE_VERSION,
                  file=self.error_file,
                  flush=True)

        # Open the input files once.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        input_kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            who="input",
            options=self.input_reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.verbose:
            print("Opening the filter input file: %s" % self.filter_file_path,
                  file=self.error_file,
                  flush=True)
        filter_kr: KgtkReader = KgtkReader.open(
            self.filter_file_path,
            who="filter",
            error_file=self.error_file,
            options=self.filter_reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        input_key_columns: typing.List[int] = self.get_key_columns(
            self.input_keys, input_kr, filter_kr, "input")
        filter_key_columns: typing.List[int] = self.get_key_columns(
            self.filter_keys, filter_kr, input_kr, "filter")

        if len(input_key_columns) != len(filter_key_columns):
            print("There are %d input key columns but %d filter key columns.  Exiting." %
                  (len(input_key_columns), len(filter_key_columns)),
                  file=self.error_file,
                  flush=True)
            # Nothing else will consume the readers, so release them here.
            input_kr.close()
            filter_kr.close()
            return

        # Both writers copy the input file's columns and mode.
        ew: typing.Optional[KgtkWriter] = None
        if self.output_file_path is not None:
            if self.verbose:
                print("Opening the output file: %s" % self.output_file_path,
                      file=self.error_file,
                      flush=True)
            ew = KgtkWriter.open(input_kr.column_names,
                                 self.output_file_path,
                                 mode=input_kr.mode,
                                 require_all_columns=False,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=True,
                                 gzip_in_parallel=False,
                                 verbose=self.verbose,
                                 very_verbose=self.very_verbose)

        rew: typing.Optional[KgtkWriter] = None
        if self.reject_file_path is not None:
            if self.verbose:
                print("Opening the reject file: %s" % self.reject_file_path,
                      file=self.error_file,
                      flush=True)
            rew = KgtkWriter.open(input_kr.column_names,
                                  self.reject_file_path,
                                  mode=input_kr.mode,
                                  require_all_columns=False,
                                  prohibit_extra_columns=True,
                                  fill_missing_columns=True,
                                  gzip_in_parallel=False,
                                  verbose=self.verbose,
                                  very_verbose=self.very_verbose)

        try:
            if self.cache_input:
                if self.preserve_order:
                    self.process_cacheing_input_preserving_order(
                        input_kr=input_kr,
                        filter_kr=filter_kr,
                        input_key_columns=input_key_columns,
                        filter_key_columns=filter_key_columns,
                        ew=ew,
                        rew=rew)
                else:
                    self.process_cacheing_input(
                        input_kr=input_kr,
                        filter_kr=filter_kr,
                        input_key_columns=input_key_columns,
                        filter_key_columns=filter_key_columns,
                        ew=ew,
                        rew=rew)
            else:
                self.process_cacheing_filter(
                    input_kr=input_kr,
                    filter_kr=filter_kr,
                    input_key_columns=input_key_columns,
                    filter_key_columns=filter_key_columns,
                    ew=ew,
                    rew=rew)
        finally:
            # Close the writers even when processing fails.
            if ew is not None:
                ew.close()
            if rew is not None:
                rew.close()
コード例 #30
0
ファイル: text_embedding.py プロジェクト: vishalbelsare/kgtk
def load_property_labels_file(
    input_files: typing.List[str],
    error_file: typing.TextIO,
    reader_options: KgtkReaderOptions,
    value_options: KgtkValueOptions,
    label_filter: typing.List[str],
    verbose: bool = False,
):
    """Build a node-to-label dictionary from one or more KGTK label files.

    For every row, node1 is the dictionary key and node2 supplies the label.
    A node2 value that starts with a single or double quote is decoded with
    KgtkFormat.destringify() into text, language, and language suffix.  A
    label whose language is "en" with an empty suffix always replaces the
    stored entry; any other label is stored only when the node has no entry
    yet.

    Args:
        input_files: Paths of the files to read, in order.
        error_file: Stream that receives diagnostics.
        reader_options: Options passed to KgtkReader.open().
        value_options: Value options passed to KgtkReader.open().
        label_filter: When non-empty, only rows whose label column value is
            in this list are used.
        verbose: Passed to KgtkReader.open().

    Returns:
        The mapping from node1 values to labels.

    Raises:
        KGTKException: When a file lacks a node1 or node2 column, or lacks a
            label column while label_filter is non-empty.
    """
    labels_dict: typing.MutableMapping[str, str] = {}
    # The filter does not change while reading; test it once.
    use_label_filter: bool = len(label_filter) > 0

    for each_file in input_files:
        kr: KgtkReader = KgtkReader.open(
            Path(each_file),
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
        )
        # The try/finally closes the reader on every path, including the
        # missing-column failure below and errors raised while reading rows.
        try:
            fail: bool = False
            if kr.node1_column_idx < 0:
                fail = True
                print("Cannot determine which column is node1 in %s" % each_file,
                      file=error_file,
                      flush=True)
            if use_label_filter and kr.label_column_idx < 0:
                fail = True
                print("Cannot determine which column is label in %s" % each_file,
                      file=error_file,
                      flush=True)
            if kr.node2_column_idx < 0:
                fail = True
                print("Cannot determine which column is node2 in %s" % each_file,
                      file=error_file,
                      flush=True)
            if fail:
                raise KGTKException("Cannot identify a required column in %s" %
                                    each_file)

            row: typing.List[str]
            for row in kr:
                if use_label_filter and row[kr.label_column_idx] not in label_filter:
                    continue

                node_id: str = row[kr.node1_column_idx]
                node_label: str = row[kr.node2_column_idx]
                text: str
                language: str
                language_suffix: str
                if node_label.startswith(("'", '"')):
                    text, language, language_suffix = KgtkFormat.destringify(
                        node_label)
                else:
                    text = node_label
                    language = ""
                    language_suffix = ""

                # The following code will take the last-read English label,
                # otherwise, the first-read non-English label.
                if language == "en" and language_suffix == "":
                    labels_dict[node_id] = text
                elif node_id not in labels_dict:
                    # NOTE(review): this stores the raw node2 value rather
                    # than the decoded text used for English labels --
                    # confirm that the difference is intended.
                    labels_dict[node_id] = node_label
        finally:
            kr.close()

    return labels_dict