Example #1
0
def generate_kgtk_output(entities_output,output_kgtk_file,output_no_header,verbose,very_verbose):
    """Convert a tab-separated entity/vector file into a KGTK file.

    Every line of ``entities_output`` is split on tabs.  The first field is
    written as ``node1``, the remaining fields are joined with commas and
    written as ``node2``, and ``label`` is always ``graph_embeddings``.

    Args:
        entities_output: Path of the tab-separated input file.
        output_kgtk_file: Destination handed to ``KgtkWriter.open``.
        output_no_header: Forwarded to the writer as ``no_header``.
        verbose: When true, progress is logged; also forwarded to the writer.
        very_verbose: Forwarded to the writer.
    """
    module_name = 'graph_embeddings'

    # Open the output file.
    kw: KgtkWriter = KgtkWriter.open(['node1', 'label', 'node2'],
                                     output_kgtk_file,
                                     mode=KgtkWriter.Mode.AUTO,
                                     require_all_columns=False,
                                     prohibit_extra_columns=False,
                                     fill_missing_columns=False,
                                     gzip_in_parallel=False,
                                     no_header=output_no_header,
                                     verbose=verbose,
                                     very_verbose=very_verbose)

    input_line_count: int = 0
    try:
        if verbose:
            # logging.info() takes neither file= nor flush=, and there is no
            # `self` in a module-level function, so log the plain message.
            logging.info("Processing the input records.")

        with open(entities_output) as wv_file:
            for line in wv_file:
                # Drop the newline, then split once and reuse the fields.
                fields = line.replace('\n', '').split('\t')
                kw.write([fields[0], module_name, ','.join(fields[1:])])
                input_line_count += 1

        if verbose:
            logging.info("Processed %d records.", input_line_count)
    finally:
        # Close the writer even when reading or writing fails.
        kw.close()
Example #2
0
    def write_updated_namespace_file(self):
        """Write the namespace table to the updated namespaces file.

        Returns immediately when no updated namespaces file path is set.
        Otherwise one row is written per namespace id, in sorted id order:
        the id, the prefix expansion label, and the value mapped to that id
        wrapped in double quotes.
        """
        target_path = self.updated_namespace_file_path
        if target_path is None:
            # No updated namespaces file was requested.
            return

        if self.verbose:
            print("Opening updated namespaces file %s" % str(target_path),
                  file=self.error_file, flush=True)

        writer: KgtkWriter = KgtkWriter.open(
            self.COLUMN_NAMES,
            target_path,
            mode=KgtkWriter.Mode.EDGE,
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        for ns_id in sorted(self.namespace_ids.keys()):
            row = [
                ns_id,
                self.prefix_expansion_label,
                '"' + self.namespace_ids[ns_id] + '"',
            ]
            writer.write(row)
        writer.close()
Example #3
0
    def process(self):
        """Write the node1, relation and node2 columns of the input file, without a header.

        The writer is opened with the input's column names and its underlying
        file is then truncated from the start before any row is written; per
        the original author's note this removes the header line the writer
        has already produced.
        """
        def report(template, *values):
            # Progress messages are only emitted in verbose mode.
            if self.verbose:
                message = template % values if values else template
                print(message, file=self.error_file, flush=True)

        report("Opening the input file: %s", str(self.input_file_path))
        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        report("Opening the output file: %s", str(self.output_file_path))
        kw: KgtkWriter = KgtkWriter.open(
            kr.column_names,
            self.output_file_path,
            mode=KgtkWriter.Mode[kr.mode.name],
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        report("Processing the input records.")

        # Output column order: node1, relation, node2.
        node1_index = kr.get_node1_column_index()
        node2_index = kr.get_node2_column_index()
        relation_index = kr.get_id_column_index('relation')
        selected = (node1_index, relation_index, node2_index)

        # Rewind to the top of the output and cut everything after it,
        # discarding what the writer emitted when it was opened.
        out = kw.file_out
        out.seek(0)
        out.truncate()

        records_seen = 0
        for record in kr:
            records_seen += 1
            kw.write([record[position] for position in selected])

        report("Processed %d records.", records_seen)

        kw.close()
Example #4
0
    def process(self):
        """Label the connected components of the input graph and write them out.

        The input file is loaded into a graph with ``load_graph_from_csv``.
        When ``self.properties`` is set (a comma-separated string), the graph
        is reduced to the edges returned by ``find_edge`` for each listed
        value before components are labelled.  One output row is written per
        vertex: (vertex name, ``connected_component``, component number).

        The output writer and the input reader are always closed, including
        when an error occurs, so the output is not left unflushed.
        """
        input_kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            who="input",
            options=self.input_reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        ew = None
        try:
            input_key_columns: typing.List[int] = self.get_key_columns(
                input_kr, "input")
            label_col_idx = input_key_columns[1]
            # Edge property name used for the label column in the loaded graph.
            label = '{}{}'.format('c', label_col_idx)

            g = load_graph_from_csv(str(input_kr.file_path),
                                    not (self.undirected),
                                    skip_first=not (self.no_header),
                                    hashed=True,
                                    csv_options={'delimiter': '\t'},
                                    ecols=(input_key_columns[0],
                                           input_key_columns[2]))

            header = ['node1', 'label', 'node2']
            if self.properties:
                # Keep only the edges whose label matches a requested property.
                es = []
                for e in self.properties.split(','):
                    es += (find_edge(g, g.edge_properties[label], e))
                g.clear_edges()
                g.add_edge_list(list(set(es)))
            comp, _ = label_components(g, directed=self.strong)

            # NOTE(review): the reader's mode object is passed straight to the
            # writer here — confirm the writer accepts it.
            ew = KgtkWriter.open(header,
                                 self.output_file_path,
                                 mode=input_kr.mode,
                                 require_all_columns=False,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=True,
                                 gzip_in_parallel=False,
                                 verbose=self.verbose,
                                 very_verbose=self.very_verbose)
            for v, c in enumerate(comp):
                ew.write([
                    g.vertex_properties['name'][v], 'connected_component',
                    str(c)
                ])
        finally:
            # Previously neither handle was closed.
            if ew is not None:
                ew.close()
            input_kr.close()
Example #5
0
 def write_files(error_file, file_number, file_prefix, kr, lines_to_write, output_path, Qnode, reader_options,
                 split_by_qnode, suffix):
     """Write ``lines_to_write`` to one KGTK file under ``output_path``.

     The file is named ``<Qnode><suffix>`` when ``split_by_qnode`` is true,
     otherwise ``<file_prefix><file_number><suffix>``.  Column names and mode
     are taken from the reader ``kr``; the mgzip settings are taken from
     ``reader_options``.
     """
     destination = (Path(f'{output_path}/{Qnode}{suffix}')
                    if split_by_qnode
                    else Path(f'{output_path}/{file_prefix}{file_number}{suffix}'))

     writer = KgtkWriter.open(
         kr.column_names,
         destination,
         mode=KgtkWriter.Mode[kr.mode.name],
         use_mgzip=reader_options.use_mgzip,  # Hack!
         mgzip_threads=reader_options.mgzip_threads,  # Hack!
         error_file=error_file,
         verbose=False,
         very_verbose=False,
     )
     for record in lines_to_write:
         writer.write(record)
     writer.close()
Example #6
0
def generate_kgtk_output(entities_output, output_kgtk_file, verbose,
                         very_verbose):
    """Convert a tab-separated entity/vector file into headerless KGTK rows.

    The writer is opened with the columns ``id, node1, node2, relation`` and
    its underlying file is then truncated from the start, which the original
    comments describe as deleting the header line.  Every input line is split
    on tabs and written as (first field, ``graph_embeddings``, remaining
    fields joined with commas).

    Args:
        entities_output: Path of the tab-separated input file.
        output_kgtk_file: Destination handed to ``KgtkWriter.open``.
        verbose: When true, progress is logged; also forwarded to the writer.
        very_verbose: Forwarded to the writer.
    """
    module_name = 'graph_embeddings'

    # Open the output file.
    kw: KgtkWriter = KgtkWriter.open(
        ['id', 'node1', 'node2', 'relation'],  # in order to obey the kgtk rules
        output_kgtk_file,
        mode=KgtkWriter.Mode.AUTO,
        require_all_columns=False,
        prohibit_extra_columns=False,
        fill_missing_columns=False,
        gzip_in_parallel=False,
        verbose=verbose,
        very_verbose=very_verbose)

    input_line_count: int = 0
    try:
        if verbose:
            # logging.info() takes neither file= nor flush=, and there is no
            # `self` in a module-level function, so log the plain message.
            logging.info("Processing the input records.")

        # Delete the header: rewind to the top of the file and cut the rest.
        kw.file_out.seek(0)
        kw.file_out.truncate()

        with open(entities_output) as wv_file:
            for line in wv_file:
                # Drop the newline, then split once and reuse the fields.
                fields = line.replace('\n', '').split('\t')
                kw.write([fields[0], module_name, ','.join(fields[1:])])
                input_line_count += 1

        if verbose:
            logging.info("Processed %d records.", input_line_count)
    finally:
        # Close the writer even when reading or writing fails.
        kw.close()
Example #7
0
    def process(self):
        """Copy every row of the input KGTK file to the output file unchanged.

        The output is opened with the input's column names and mode.  Progress
        messages go to ``self.error_file`` when ``self.verbose`` is set.
        """
        def report(template, *values):
            # Progress messages are only emitted in verbose mode.
            if self.verbose:
                message = template % values if values else template
                print(message, file=self.error_file, flush=True)

        report("Opening the input file: %s", str(self.input_file_path))
        reader: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        report("Opening the output file: %s", str(self.output_file_path))
        writer: KgtkWriter = KgtkWriter.open(
            reader.column_names,
            self.output_file_path,
            mode=KgtkWriter.Mode[reader.mode.name],
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        report("Processing the input records.")

        records_copied = 0
        for record in reader:
            records_copied += 1
            writer.write(record)

        report("Processed %d records.", records_copied)

        writer.close()
Example #8
0
    def open_output_writer(
        self, ikr: KgtkReader, lifted_column_idxs: typing.List[int]
    ) -> typing.Tuple[KgtkWriter, typing.List[int]]:
        """Open the output writer using the computed output column names.

        Returns:
            The opened writer together with the lifted output column indexes
            produced by ``build_output_column_names``.
        """
        built = self.build_output_column_names(ikr, lifted_column_idxs)
        column_names, lifted_idxs = built

        if self.verbose:
            print("Opening the output file: %s" % self.output_file_path,
                  file=self.error_file, flush=True)

        # The writer uses the same mode as the input reader.
        writer: KgtkWriter = KgtkWriter.open(
            column_names,
            self.output_file_path,
            mode=KgtkWriter.Mode[ikr.mode.name],
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        return writer, lifted_idxs
Example #9
0
    def python_sort():
        """Sort the input KGTK file in memory and write it to ``output_path``.

        The sort key is built from the columns listed in ``columns``; each
        entry may be a column name or a 1-based column number, and may be a
        comma-separated list.  When no columns are given, node files sort on
        the id column and edge files sort on the id column (when present)
        followed by node1, label and node2.

        Rows with equal sort keys are all kept, in input order.

        Raises:
            KGTKException: for a column number outside 1..number of columns,
                an unknown column name, or an unknown file mode when no sort
                columns were specified.
        """
        if verbose:
            print("Opening the input file: %s" % str(input_path),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_path,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        def fail(message):
            # Release the reader and run cleanup() before reporting the error.
            kr.close()
            cleanup()
            raise KGTKException(message)

        key_idxs: typing.List[int] = []
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            for column_name in columns:
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        sort_idx = int(column_name_2)
                        # Column numbers are 1-based; 0 would otherwise become
                        # index -1 and silently select the last column.
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            fail("Invalid column number %d (max %d)." %
                                 (sort_idx, len(kr.column_names)))
                        key_idxs.append(sort_idx - 1)
                    elif column_name_2 in kr.column_names:
                        key_idxs.append(kr.column_name_map[column_name_2])
                    else:
                        fail("Unknown column_name %s" % column_name_2)
        elif kr.is_node_file:
            key_idxs.append(kr.id_column_idx)
        elif kr.is_edge_file:
            if kr.id_column_idx >= 0:
                key_idxs.append(kr.id_column_idx)
            key_idxs.append(kr.node1_column_idx)
            key_idxs.append(kr.label_column_idx)
            key_idxs.append(kr.node2_column_idx)
        else:
            fail("Unknown KGTK file mode, please specify the sorting columns.")

        if verbose:
            print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]),
                  file=error_file,
                  flush=True)

        try:
            # Keep (key, row) pairs in a list: a dict keyed by the sort key
            # would drop every row that shares its key with a later row.
            keyed_rows: typing.List[typing.Tuple[str, typing.List[str]]] = []

            progress_startup()
            for row in kr:
                key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx]
                                                          for idx in key_idxs)
                keyed_rows.append((key, row))
            if verbose:
                print("\nRead %d data lines." % len(keyed_rows),
                      file=error_file,
                      flush=True)

            # list.sort() is stable, so rows with equal keys keep input order.
            keyed_rows.sort(key=lambda pair: pair[0])

            kw = KgtkWriter.open(kr.column_names,
                                 output_path,
                                 mode=KgtkWriter.Mode[kr.mode.name],
                                 verbose=verbose,
                                 very_verbose=very_verbose)
            try:
                for _, row in keyed_rows:
                    kw.write(row)
            finally:
                kw.close()
        finally:
            kr.close()
Example #10
0
def run(
        input_file: KGTKFiles,
        pattern_file: KGTKFiles,
        output_file: KGTKFiles,
        reject_file: KGTKFiles,
        grouped_input: bool = False,
        reject_node1_groups: bool = False,
        no_complaints: bool = False,
        complain_immediately: bool = False,
        add_isa_column: bool = False,
        isa_column_name: str = "isa;node2",
        autovalidate: bool = True,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = False,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkReaderOptions and KgtkValueOptions want.
) -> int:
    """Validate a KGTK file against a file of property patterns.

    Patterns are loaded from ``pattern_file``; the rows of ``input_file`` are
    then handed to a ``PropertyPatternValidator`` together with the optional
    output writer and the optional reject writer.

    Args:
        input_file: The KGTK file to validate.
        pattern_file: The KGTK edge file holding the property patterns.
        output_file: Optional destination passed to the validator as the
            output writer.
        reject_file: Optional destination passed to the validator as the
            reject writer.
        add_isa_column: When an output file is given, make sure the output
            has a column named ``isa_column_name`` and pass its index to the
            validator.
        errors_to_stdout: Send messages to stdout instead of stderr.
        show_options: Print the effective options before running.
        verbose, very_verbose: Progress reporting, forwarded to the readers,
            writers and validator.
        **kwargs: Consumed by ``KgtkReaderOptions`` and ``KgtkValueOptions``.
        The remaining options are forwarded unchanged to the validator.

    Returns:
        0 on success.

    Raises:
        KGTKException: wrapping any exception raised while processing.
    """
    # import modules locally
    from contextlib import ExitStack
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderMode, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.propertypatternvalidator import PropertyPatterns, PropertyPatternValidator
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    pattern_kgtk_file: Path = KGTKArgumentParser.get_input_file(
        pattern_file, default_stdin=False)
    output_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(output_file)
    reject_kgtk_file: typing.Optional[
        Path] = KGTKArgumentParser.get_optional_output_file(reject_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    # Every line goes to error_file so the report is not split across streams.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--pattern-file=%s" % str(pattern_kgtk_file), file=error_file)
        if output_kgtk_file is not None:
            print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if reject_kgtk_file is not None:
            print("--reject-file=%s" % str(reject_kgtk_file), file=error_file)
        # NOTE(review): grouped_input is reported as "--presorted" — confirm
        # the option name.
        print("--presorted=%s" % str(grouped_input), file=error_file)
        print("--reject-node1-groups=%s" % str(reject_node1_groups),
              file=error_file)
        print("--complain-immediately=%s" % str(complain_immediately),
              file=error_file)
        print("--add-isa-column=%s" % str(add_isa_column), file=error_file)
        print("--isa-column-name=%s" % str(isa_column_name), file=error_file)
        print("--autovalidate=%s" % str(autovalidate), file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Reading data from '%s'" % str(input_kgtk_file),
              file=error_file,
              flush=True)
        print("Reading patterns from '%s'" % str(pattern_kgtk_file),
              file=error_file,
              flush=True)
        if output_kgtk_file is not None:
            print("Writing good data to '%s'" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        if reject_kgtk_file is not None:
            print("Writing rejected data to '%s'" % str(reject_kgtk_file),
                  file=error_file,
                  flush=True)

    try:
        # The stack closes every reader and writer that was opened, on both
        # the success path and the error path.
        with ExitStack() as stack:
            pkr: KgtkReader = KgtkReader.open(pattern_kgtk_file,
                                              error_file=error_file,
                                              mode=KgtkReaderMode.EDGE,
                                              options=reader_options,
                                              value_options=value_options,
                                              verbose=verbose,
                                              very_verbose=very_verbose)
            stack.callback(pkr.close)

            pps: PropertyPatterns = PropertyPatterns.load(
                pkr,
                value_options,
                error_file=error_file,
                verbose=verbose,
                very_verbose=very_verbose)

            kr: KgtkReader = KgtkReader.open(input_kgtk_file,
                                             error_file=error_file,
                                             options=reader_options,
                                             value_options=value_options,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            stack.callback(kr.close)

            output_column_names: typing.List[str] = []
            isa_column_idx: int = -1
            if output_kgtk_file is not None:
                output_column_names = kr.column_names.copy()
                if add_isa_column:
                    if isa_column_name in output_column_names:
                        isa_column_idx = output_column_names.index(
                            isa_column_name)
                    else:
                        isa_column_idx = len(output_column_names)
                        output_column_names.append(isa_column_name)

            ppv: PropertyPatternValidator = PropertyPatternValidator.new(
                pps,
                kr,
                grouped_input=grouped_input,
                reject_node1_groups=reject_node1_groups,
                no_complaints=no_complaints,
                complain_immediately=complain_immediately,
                isa_column_idx=isa_column_idx,
                autovalidate=autovalidate,
                value_options=value_options,
                error_file=error_file,
                verbose=verbose,
                very_verbose=very_verbose)

            kw: typing.Optional[KgtkWriter] = None
            if output_kgtk_file is not None:
                kw = KgtkWriter.open(output_column_names,
                                     output_kgtk_file,
                                     verbose=verbose,
                                     very_verbose=very_verbose)
                stack.callback(kw.close)

            rkw: typing.Optional[KgtkWriter] = None
            if reject_kgtk_file is not None:
                # output_column_names is only filled in when an output file
                # was requested; fall back to the input columns so a
                # reject-only run does not open a writer with no columns.
                reject_column_names: typing.List[str] = (
                    output_column_names
                    if output_column_names else kr.column_names.copy())
                rkw = KgtkWriter.open(reject_column_names,
                                      reject_kgtk_file,
                                      verbose=verbose,
                                      very_verbose=very_verbose)
                stack.callback(rkw.close)

            ppv.process(kr, kw, rkw)

            if verbose:
                print("Read %d rows, %d valid" %
                      (ppv.input_row_count, ppv.valid_row_count),
                      file=error_file,
                      flush=True)
                if kw is not None:
                    print("Wrote %d good rows" % ppv.output_row_count,
                          file=error_file,
                          flush=True)
                if rkw is not None:
                    print("Wrote %d rejected rows" % ppv.reject_row_count,
                          file=error_file,
                          flush=True)

            return 0

    except Exception as e:
        raise KGTKException(e) from e
Example #11
0
def run(
        input_file: KGTKFiles,
        path_file: KGTKFiles,
        output_file: KGTKFiles,
        statistics_only: bool,
        undirected: bool,
        max_hops: int,
        source_column_name: typing.Optional[str],
        target_column_name: typing.Optional[str],
        shortest_path: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Find paths between pairs of nodes in a KGTK graph and write them out.

    The path file supplies (source, target) pairs.  The input file is loaded
    into a graph; unless ``statistics_only`` is set, every graph edge is
    written to the output first.  Then, for every pair whose source and
    target each match exactly one vertex, each path found is written as one
    row per edge: ``node1`` is the path id (``p<N>``), ``label`` is the edge's
    position within the path, and ``node2`` is the edge's id.

    Args:
        input_file: The KGTK edge file holding the graph.
        path_file: The KGTK file holding the source/target pairs.
        output_file: Destination for the output edges.
        statistics_only: Skip writing the graph's own edges.
        undirected: Load the graph as undirected.
        max_hops: Cutoff passed to ``all_paths`` (unused for shortest paths).
        source_column_name: Name passed when locating the source column of
            the path file.
        target_column_name: Name passed when locating the target column of
            the path file.
        shortest_path: Use ``all_shortest_paths`` instead of ``all_paths``.
        errors_to_stdout: Send messages to stdout instead of stderr.
        verbose, very_verbose: Progress reporting, forwarded to the readers
            and writer.
        **kwargs: Consumed by ``KgtkReaderOptions`` and ``KgtkValueOptions``.

    Raises:
        KGTKException: for missing columns, a path row of the wrong length,
            or any other failure (wrapped with an ``Error: `` prefix).
    """
    # import modules locally
    from pathlib import Path
    import sys

    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths

    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    from kgtk.exceptions import KGTKException
    try:

        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)

        # Vertex property that holds each vertex's node name.
        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file),
                  file=error_file,
                  flush=True)
        pairs = []
        paths_read: int = 0
        pkr: KgtkReader = KgtkReader.open(
            path_kgtk_file,
            error_file=error_file,
            options=path_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        try:
            path_source_idx: int = pkr.get_node1_column_index(
                source_column_name)
            if path_source_idx < 0:
                print("Missing node1 (source) column name in the path file.",
                      file=error_file,
                      flush=True)

            path_target_idx: int = pkr.get_node2_column_index(
                target_column_name)
            if path_target_idx < 0:
                # The target is looked up as the node2 column.
                print("Missing node2 (target) column name in the path file.",
                      file=error_file,
                      flush=True)
            if path_source_idx < 0 or path_target_idx < 0:
                raise KGTKException("Exiting due to missing columns.")

            path_row: typing.List[str]
            for path_row in pkr:
                paths_read += 1
                if len(path_row) != pkr.column_count:
                    raise KGTKException(
                        "Exiting because line %d in the path file (%s) is the wrong length: %d columns expected, %d were read."
                        % (paths_read, str(path_kgtk_file), pkr.column_count,
                           len(path_row)))
                pairs.append(
                    (path_row[path_source_idx], path_row[path_target_idx]))
        finally:
            # Close the path reader on every exit, including the
            # wrong-length error above.
            pkr.close()

        if verbose:
            print("%d path rows read" % paths_read,
                  file=error_file,
                  flush=True)
        if len(pairs) == 0:
            print("No path pairs found, the output will be empty.",
                  file=error_file,
                  flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=input_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        kw: typing.Optional[KgtkWriter] = None
        try:
            sub_index: int = kr.get_node1_column_index()
            if sub_index < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred_index: int = kr.get_label_column_index()
            if pred_index < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj_index: int = kr.get_node2_column_index()
            if obj_index < 0:
                print("Missing node2 (object) column",
                      file=error_file,
                      flush=True)
            id_index: int = kr.get_id_column_index()
            if id_index < 0:
                print("Missing id column", file=error_file, flush=True)
            if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
                raise KGTKException("Exiting due to missing columns.")

            # Edge properties are looked up by the input's own column names.
            predicate: str = kr.column_names[pred_index]
            id_col_name: str = kr.column_names[id_index]

            G = load_graph_from_kgtk(kr,
                                     directed=not undirected,
                                     ecols=(sub_index, obj_index),
                                     verbose=verbose,
                                     out=error_file)

            output_columns: typing.List[str] = [
                'node1', 'label', 'node2', 'id'
            ]
            kw = KgtkWriter.open(output_columns,
                                 output_kgtk_file,
                                 mode=KgtkWriter.Mode.EDGE,
                                 require_all_columns=True,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=False,
                                 verbose=verbose,
                                 very_verbose=very_verbose)

            id_count = 0
            if not statistics_only:
                for e in G.edges():
                    sid, oid = e
                    lbl = G.ep[predicate][e]
                    kw.write([
                        G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                        '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)
                    ])
                    id_count += 1
                if verbose:
                    print("%d edges found." % id_count,
                          file=error_file,
                          flush=True)

            id_count = 0
            path_id = 0
            for source_node, target_node in pairs:
                name_prop = G.properties[('v', id_col)]
                source_ids = find_vertex(G, prop=name_prop, match=source_node)
                target_ids = find_vertex(G, prop=name_prop, match=target_node)
                # Only pairs that resolve to exactly one vertex each are
                # searched; others are skipped silently.
                if len(source_ids) != 1 or len(target_ids) != 1:
                    continue

                source_id = source_ids[0]
                target_id = target_ids[0]
                if shortest_path:
                    _all_paths = all_shortest_paths(G,
                                                    source_id,
                                                    target_id,
                                                    edges=True)
                else:
                    _all_paths = all_paths(G,
                                           source_id,
                                           target_id,
                                           cutoff=max_hops,
                                           edges=True)

                for path in _all_paths:
                    node1: str = 'p%d' % path_id
                    for edge_num, an_edge in enumerate(path):
                        # Use the input's actual id column name rather than
                        # assuming it is literally 'id'.
                        edge_id = G.properties[('e', id_col_name)][an_edge]
                        kw.write([
                            node1,
                            str(edge_num), edge_id,
                            '{}-{}-{}'.format(node1, edge_num, id_count)
                        ])
                        id_count += 1
                    path_id += 1

            if verbose:
                print("%d paths contining %d edges found." %
                      (path_id, id_count),
                      file=error_file,
                      flush=True)
        finally:
            # Close the writer and the reader on every exit.
            if kw is not None:
                kw.close()
            kr.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e)) from e
Example #12
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        compute_degrees: bool,
        compute_pagerank: bool,
        compute_hits: bool,
        log_file: str,
        statistics_only: bool,
        vertex_in_degree: str,
        vertex_out_degree: str,
        vertex_pagerank: str,
        vertex_auth: str,
        vertex_hubs: str,
        top_n: int,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Load a KGTK edge file as a graph, log statistics, and write node measures.

    The input is read with ``KgtkReader`` and loaded through
    ``load_graph_from_kgtk``.  A plain-text report is written to ``log_file``
    (graph size, top relations and, when requested, degree statistics,
    PageRank and HITS rankings).  The KGTK output receives the original edges
    (unless ``statistics_only`` is set) followed, for every vertex, by an
    in-degree row, an out-degree row and one row per additional vertex
    property.

    Args:
        input_file: KGTK input file specification.
        output_file: KGTK output file specification.
        undirected: When true the graph is loaded as undirected.
        compute_degrees: Add degree statistics to the log file.  The
            per-vertex degree rows are written to the output regardless.
        compute_pagerank: Compute PageRank and log the ``top_n`` vertices.
        compute_hits: Compute HITS and log the ``top_n`` hubs/authorities.
        log_file: Path of the text report; it is overwritten.
        statistics_only: When true the input edges are not copied to the output.
        vertex_in_degree: Label used for the in-degree output rows.
        vertex_out_degree: Label used for the out-degree output rows.
        vertex_pagerank: Label used for the PageRank output rows.
        vertex_auth: Label used for the HITS authority output rows.
        vertex_hubs: Label used for the HITS hub output rows.
        top_n: Number of top-ranked vertices listed in the log.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Accepted for interface compatibility; not read here.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Source of the reader and value option structures.

    Raises:
        KGTKException: On any failure.  The message is prefixed with
            ``'Error: '`` and the original exception is chained as the cause.
    """
    # import modules locally
    from contextlib import ExitStack
    from pathlib import Path
    import sys

    from graph_tool import centrality
    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Maps the internal vertex property names to the caller-chosen labels.
    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:
        # The stack closes the writer and then the reader on every exit path,
        # so neither handle leaks when a later step fails.
        with ExitStack() as open_files:

            # Select where to send error messages, defaulting to stderr.
            error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

            # Build the option structures.
            reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
            value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

            input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
            output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
                output_file)

            # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
            directions = ['in', 'out', 'total']
            id_col = 'name'
            output_columns = ["node1", "label", "node2", "id"]

            if verbose:
                print('loading the KGTK input file...\n',
                      file=error_file,
                      flush=True)
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            open_files.callback(kr.close)

            # Report every missing required column before giving up.
            sub: int = kr.get_node1_column_index()
            if sub < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred: int = kr.get_label_column_index()
            if pred < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj: int = kr.get_node2_column_index()
            if obj < 0:
                print("Missing node2 (object) column", file=error_file, flush=True)
            if sub < 0 or pred < 0 or obj < 0:
                raise KGTKException("Exiting due to missing columns.")

            predicate: str = kr.column_names[pred]

            G2 = load_graph_from_kgtk(kr,
                                      directed=not undirected,
                                      ecols=(sub, obj),
                                      verbose=verbose,
                                      out=error_file)
            if verbose:
                print('graph loaded! It has %d nodes and %d edges.' %
                      (G2.num_vertices(), G2.num_edges()),
                      file=error_file,
                      flush=True)

            kw: KgtkWriter = KgtkWriter.open(output_columns,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            open_files.callback(kw.close)

            # Human-readable report.
            with open(log_file, 'w') as writer:
                writer.write('graph loaded! It has %d nodes and %d edges\n' %
                             (G2.num_vertices(), G2.num_edges()))
                writer.write('\n###Top relations:\n')
                for rel, freq in gtanalysis.get_topN_relations(
                        G2, pred_property=predicate):
                    writer.write('%s\t%d\n' % (rel, freq))

                if compute_degrees:
                    writer.write('\n###Degrees:\n')
                    for direction in directions:
                        degree_data = gtanalysis.compute_node_degree_hist(
                            G2, direction)
                        max_degree = len(degree_data) - 1
                        mean_degree, std_degree = gtanalysis.compute_avg_node_degree(
                            G2, direction)
                        writer.write(
                            '%s degree stats: mean=%f, std=%f, max=%d\n' %
                            (direction, mean_degree, std_degree, max_degree))

                if compute_pagerank:
                    writer.write('\n###PageRank\n')
                    v_pr = G2.new_vertex_property('float')
                    centrality.pagerank(G2, prop=v_pr)
                    # Registering the property makes it show up in the
                    # per-vertex output rows written below.
                    G2.properties[('v', 'vertex_pagerank')] = v_pr
                    writer.write('Max pageranks\n')
                    result = gtanalysis.get_topn_indices(G2, 'vertex_pagerank',
                                                         top_n, id_col)
                    for n_id, n_label, pr in result:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

                if compute_hits:
                    writer.write('\n###HITS\n')
                    # The first returned value is not used here.
                    _hits_eig, G2.vp['vertex_hubs'], G2.vp[
                        'vertex_auth'] = gtanalysis.compute_hits(G2)
                    writer.write('HITS hubs\n')
                    main_hubs = gtanalysis.get_topn_indices(
                        G2, 'vertex_hubs', top_n, id_col)
                    for n_id, n_label, hubness in main_hubs:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, hubness))
                    writer.write('HITS auth\n')
                    main_auth = gtanalysis.get_topn_indices(
                        G2, 'vertex_auth', top_n, id_col)
                    for n_id, n_label, authority in main_auth:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority))

            # Copy the input edges unless only the statistics were requested.
            id_count = 0
            if not statistics_only:
                for e in G2.edges():
                    sid, oid = e
                    lbl = G2.ep[predicate][e]
                    kw.write([
                        G2.vp[id_col][sid], lbl, G2.vp[id_col][oid],
                        '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count)
                    ])
                    id_count += 1

            # Per-vertex rows; the id counter restarts for this section.
            id_count = 0
            for v in G2.vertices():
                v_id = G2.vp[id_col][v]
                kw.write([
                    v_id, vertex_in_degree,
                    str(v.in_degree()), '{}-{}-{}'.format(v_id, vertex_in_degree,
                                                          id_count)
                ])
                id_count += 1
                kw.write([
                    v_id, vertex_out_degree,
                    str(v.out_degree()), '{}-{}-{}'.format(v_id, vertex_out_degree,
                                                           id_count)
                ])
                id_count += 1

                # NOTE(review): only the PageRank/HITS properties have labels in
                # v_prop_dict; any other vertex property created by the loader
                # would raise KeyError here -- confirm the loader adds none.
                for vprop in G2.vertex_properties.keys():
                    if vprop == id_col:
                        continue
                    kw.write([
                        v_id, v_prop_dict[vprop],
                        str(G2.vp[vprop][v]),
                        '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)
                    ])
                    id_count += 1

    except Exception as e:
        # Keep the original traceback attached for debugging.
        raise KGTKException('Error: ' + str(e)) from e
Example #13
0
    def python_sort():
        """Sort the input KGTK file in memory and write the sorted rows.

        This is a nested helper: the input/output paths, reader options,
        ``columns``, ``numeric_columns``, ``reverse_columns``, ``numeric_sort``,
        ``reverse_sort``, ``cleanup`` and ``progress_startup`` all come from
        the enclosing scope.

        Sort keys are taken from ``columns`` (names or 1-based column numbers,
        optionally comma-separated).  Without explicit columns, a node file is
        keyed on its id column and an edge file on id (when present), node1,
        label and node2.  Rows sharing a key keep their input order.

        Raises:
            KGTKException: For unsupported options (numeric/reverse column
                lists, numeric sort on several keys), an invalid column
                number, an unknown column name, or an unknown file mode with
                no explicit columns.
        """
        if numeric_columns is not None and len(numeric_columns) > 0:
            raise KGTKException(
                'Error: the pure Python sorter does not currently support numeric column sorts.'
            )

        if reverse_columns is not None and len(reverse_columns) > 0:
            raise KGTKException(
                'Error: the pure Python sorter does not currently support reverse column sorts.'
            )

        if verbose:
            print("Opening the input file: %s" % str(input_path),
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_path,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sort_idx: int
        key_idxs: typing.List[int] = []
        if columns is not None and len(columns) > 0:
            # Process the list of column names, including splitting
            # comma-separated lists of column names.
            column_name: str
            for column_name in columns:
                column_name_2: str
                for column_name_2 in column_name.split(","):
                    column_name_2 = column_name_2.strip()
                    if len(column_name_2) == 0:
                        continue
                    if column_name_2.isdigit():
                        sort_idx = int(column_name_2)
                        # Column numbers are 1-based.  Rejecting 0 matters:
                        # it would become index -1 and silently sort on the
                        # last column.
                        if sort_idx < 1 or sort_idx > len(kr.column_names):
                            kr.close()
                            cleanup()
                            raise KGTKException(
                                "Invalid column number %d (max %d)." %
                                (sort_idx, len(kr.column_names)))
                        key_idxs.append(sort_idx - 1)
                    else:
                        if column_name_2 not in kr.column_names:
                            kr.close()
                            cleanup()
                            raise KGTKException("Unknown column_name %s" %
                                                column_name_2)
                        key_idxs.append(kr.column_name_map[column_name_2])
        else:
            if kr.is_node_file:
                key_idxs.append(kr.id_column_idx)

            elif kr.is_edge_file:
                if kr.id_column_idx >= 0:
                    key_idxs.append(kr.id_column_idx)

                key_idxs.append(kr.node1_column_idx)
                key_idxs.append(kr.label_column_idx)
                key_idxs.append(kr.node2_column_idx)
            else:
                # Release the reader like the other error paths do.
                kr.close()
                cleanup()
                raise KGTKException(
                    "Unknown KGTK file mode, please specify the sorting columns."
                )

        if verbose:
            print("sorting keys: %s" % " ".join([str(x) for x in key_idxs]),
                  file=error_file,
                  flush=True)

        if numeric_sort and len(key_idxs) > 1:
            # The reader is already open at this point; do not leak it.
            kr.close()
            raise KGTKException(
                'Error: the pure Python sorter does not currently support numeric sorts on multiple columns.'
            )

        # key -> rows carrying that key, in input order.
        lines: typing.MutableMapping[typing.Union[str, float],
                                     typing.List[typing.List[str]]] = dict()

        progress_startup()
        data_line_count: int = 0
        key: typing.Union[str, float]
        row: typing.List[str]
        for row in kr:
            data_line_count += 1
            key = KgtkFormat.KEY_FIELD_SEPARATOR.join(row[idx]
                                                      for idx in key_idxs)
            if numeric_sort:
                key = float(key)
            # Appending keeps rows with equal keys in input order (stable sort).
            lines.setdefault(key, []).append(row)
        if verbose:
            # Count rows, not distinct keys: several rows may share one key.
            print("\nRead %d data lines." % data_line_count,
                  file=error_file,
                  flush=True)

        kw = KgtkWriter.open(kr.column_names,
                             output_path,
                             mode=KgtkWriter.Mode[kr.mode.name],
                             verbose=verbose,
                             very_verbose=very_verbose)

        for key in sorted(lines, reverse=reverse_sort):
            for row in lines[key]:
                kw.write(row)

        kw.close()
        kr.close()
Example #14
0
    def process(self):
        """Explode one column of a KGTK file into one column per value field.

        The fields to extract come from ``self.type_names`` (all default
        fields of the named data types) unless ``self.field_names`` is given,
        which replaces that selection.  Each selected field is written to the
        column ``self.prefix + field_name``; an existing column with that name
        is reused only when ``self.overwrite_columns`` is set, otherwise new
        columns are appended.

        For every input row the value in ``self.column_name`` is parsed and
        validated.  Rows with an invalid value are written without exploded
        fields.  When ``self.expand_list`` is set and the value is a list,
        one output row is written per list item.

        Raises:
            ValueError: If the column name is empty, a data type or field name
                is unknown, no field is selected, the source column is missing
                from the input, a field is duplicated, or an exploded column
                already exists and overwriting is not allowed.
        """
        from contextlib import ExitStack

        if len(self.column_name) == 0:
            raise ValueError("The name of the column to explode is empty.")

        selected_field_names: typing.List[str] = []
        field_name: str

        if self.type_names is not None:
            if self.verbose:
                print("Validate the names of the data types to extract.",
                      file=self.error_file,
                      flush=True)
            type_name: str
            for type_name in self.type_names:
                if type_name not in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS:
                    raise ValueError("Unknown data type name '%s'." %
                                     type_name)
                # Merge this KGTK data type's fields into the list of selected fields:
                for field_name in KgtkValueFields.DEFAULT_DATA_TYPE_FIELDS[
                        type_name]:
                    if field_name not in selected_field_names:
                        selected_field_names.append(field_name)

        if self.field_names is not None:
            # Forget the fields selected above, choose these instead:
            selected_field_names = []
            if self.verbose:
                print("Validate the names of the fields to extract.",
                      file=self.error_file,
                      flush=True)
            for field_name in self.field_names:
                if field_name not in KgtkValueFields.FIELD_NAMES:
                    raise ValueError("Unknown field name '%s'." % field_name)
                # Merge this field into the list of selected fields:
                if field_name not in selected_field_names:
                    selected_field_names.append(field_name)

        if len(selected_field_names) == 0:
            raise ValueError("The list of fields to explode is empty.")

        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        # The stack closes the writer and then the reader on every exit path,
        # including the validation errors raised below.
        with ExitStack() as open_files:
            kr: KgtkReader = KgtkReader.open(
                self.input_file_path,
                error_file=self.error_file,
                options=self.reader_options,
                value_options=self.value_options,
                verbose=self.verbose,
                very_verbose=self.very_verbose,
            )
            open_files.callback(kr.close)

            if self.verbose:
                print("Check that the source column '%s' is present." %
                      self.column_name,
                      file=self.error_file,
                      flush=True)
            if self.column_name not in kr.column_name_map:
                raise ValueError("Column name '%s' not found in the input file." %
                                 self.column_name)
            column_idx: int = kr.column_name_map[self.column_name]

            if self.verbose:
                print(
                    "Build the map of exploded columns and list of new column names",
                    file=self.error_file,
                    flush=True)
            # field name -> index of the output column that receives it.
            explosion: typing.MutableMapping[str, int] = {}
            column_names: typing.List[str] = kr.column_names.copy()
            for field_name in selected_field_names:
                exploded_name: str = self.prefix + field_name
                if self.verbose:
                    print("Field '%s' becomes '%s'" % (field_name, exploded_name),
                          file=self.error_file,
                          flush=True)
                # The map is keyed by field name, so that is what is checked.
                if field_name in explosion:
                    raise ValueError(
                        "Field name '%s' is duplicated in the field list." %
                        field_name)
                if exploded_name in kr.column_names:
                    if self.overwrite_columns:
                        existing_idx = kr.column_name_map[exploded_name]
                        explosion[field_name] = existing_idx
                        if self.verbose:
                            print(
                                "Field '%s' is overwriting existing column '%s' (idx=%d)"
                                % (field_name, exploded_name, existing_idx),
                                file=self.error_file,
                                flush=True)
                    else:
                        raise ValueError(
                            "Exploded column '%s' already exists and not allowed to overwrite"
                            % exploded_name)
                else:
                    column_names.append(exploded_name)
                    exploded_idx: int = len(column_names) - 1
                    explosion[field_name] = exploded_idx
                    if self.verbose:
                        print("Field '%s' becomes new column '%s' (idx=%d)" %
                              (field_name, exploded_name, exploded_idx),
                              file=self.error_file,
                              flush=True)
            new_column_count: int = len(column_names) - kr.column_count
            if self.verbose:
                # Diagnostics go to the error file so that they cannot end up
                # mixed into data written to stdout.
                print("%d columns + %d columns = %d columns" %
                      (kr.column_count, new_column_count, len(column_names)),
                      file=self.error_file,
                      flush=True)
                print("Explosion length: %d" % len(explosion),
                      file=self.error_file,
                      flush=True)

            # Open the output file.
            ew: KgtkWriter = KgtkWriter.open(column_names,
                                             self.output_file_path,
                                             mode=kr.mode,
                                             output_format=self.output_format,
                                             require_all_columns=False,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=True,
                                             gzip_in_parallel=False,
                                             verbose=self.verbose,
                                             very_verbose=self.very_verbose)
            open_files.callback(ew.close)

            if self.verbose:
                print("Expanding records from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            input_line_count: int = 0
            output_line_count: int = 0

            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                # Parse the value for the colummn being exploded:
                item_to_explode: str = row[column_idx]
                value: KgtkValue = KgtkValue(item_to_explode,
                                             options=self.value_options,
                                             parse_fields=True)
                value.validate()
                if not value.is_valid():
                    if self.verbose:
                        print("Not exploding invalid item '%s' in input line %d" %
                              (item_to_explode, input_line_count),
                              file=self.error_file,
                              flush=True)
                    ew.write(row)  # This will be filled to the proper length
                    output_line_count += 1
                    continue

                if self.expand_list and value.is_list():
                    if self.verbose:
                        print("Expanding a list: '%s'" % item_to_explode,
                              file=self.error_file,
                              flush=True)
                    subvalue: KgtkValue
                    for subvalue in value.get_list_items():
                        if self.very_verbose:
                            print("Exploding '%s'" % subvalue.value,
                                  file=self.error_file,
                                  flush=True)
                        ew.write(
                            self.explode(subvalue, row, explosion,
                                         new_column_count))
                        output_line_count += 1
                else:
                    if self.very_verbose:
                        print("Exploding '%s'" % value.value,
                              file=self.error_file,
                              flush=True)
                    ew.write(self.explode(value, row, explosion, new_column_count))
                    output_line_count += 1

            if self.verbose:
                print("Read %d records, wrote %d records." %
                      (input_line_count, output_line_count),
                      file=self.error_file,
                      flush=True)
Example #15
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]],
        split_on_commas: bool,
        split_on_spaces: bool,
        strip_spaces: bool,
        all_except: bool,
        ignore_missing_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file while removing (or keeping only) the named columns.

    Args:
        input_file: KGTK input file specification.
        output_file: KGTK output file specification.
        columns: Column name arguments; ``None`` is treated as an empty list.
        split_on_commas: Split each argument on commas.
        split_on_spaces: Split the arguments on whitespace.  Do not enable
            this when column names may contain spaces.
        strip_spaces: Strip surrounding whitespace from each name.
        all_except: Keep only the named columns instead of removing them.
        ignore_missing_columns: Do not treat unknown column names as errors.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Source of the reader and value option structures.

    Returns:
        0 on success, 1 when an exception was handed to
        ``kgtk_exception_auto_handler``.
    """
    # import modules locally
    from contextlib import ExitStack
    from pathlib import Path
    import sys

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        print("--split-on-commas=%s" % str(split_on_commas), file=error_file)
        print("--split-on-spaces=%s" % str(split_on_spaces), file=error_file)
        print("--strip-spaces=%s" % str(strip_spaces), file=error_file)
        print("--all-except=%s" % str(all_except), file=error_file)
        print("--ignore-missing-columns=%s" % str(ignore_missing_columns),
              file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        if columns is None:
            columns = []  # This simplifies matters.

        if split_on_spaces:
            # We will be very lenient, and allow space-seperated arguments
            # *inside* shell quoting, e.g.
            #
            # kgtk remove_columns -c 'name name2 name3'
            #
            # Do not enable this option if spaces are legal inside your
            # column names.
            columns = " ".join(columns).split()

        # Normalize the arguments into a flat list of non-empty names.
        remove_columns: typing.List[str] = []
        arg: str
        column_name: str
        for arg in columns:
            candidates = arg.split(",") if split_on_commas else [arg]
            for column_name in candidates:
                if strip_spaces:
                    column_name = column_name.strip()
                if len(column_name) > 0:
                    remove_columns.append(column_name)
        if verbose:
            if all_except:
                print("Removing all columns except %d columns: %s" %
                      (len(remove_columns), " ".join(remove_columns)),
                      file=error_file,
                      flush=True)
            else:
                print("Removing %d columns: %s" %
                      (len(remove_columns), " ".join(remove_columns)),
                      file=error_file,
                      flush=True)
        if len(remove_columns) == 0:
            raise KGTKException("No columns to remove")

        # The stack closes the writer and then the reader on every exit path;
        # previously the reader was never closed.
        with ExitStack() as open_files:
            if verbose:
                print("Opening the input file: %s" % str(input_kgtk_file),
                      file=error_file,
                      flush=True)
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                options=reader_options,
                value_options=value_options,
                error_file=error_file,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            open_files.callback(kr.close)

            output_column_names: typing.List[str]

            # Unknown names are collected so that all of them get reported.
            trouble_column_names: typing.List[str] = []
            if all_except:
                if not ignore_missing_columns:
                    for column_name in remove_columns:
                        if column_name not in kr.column_names:
                            print("Error: cannot retain unknown column '%s'." %
                                  column_name,
                                  file=error_file,
                                  flush=True)
                            trouble_column_names.append(column_name)

                # Keep the requested columns in their input order.
                requested = set(remove_columns)
                output_column_names = [
                    column_name for column_name in kr.column_names
                    if column_name in requested
                ]

            else:
                output_column_names = kr.column_names.copy()
                for column_name in remove_columns:
                    if column_name in output_column_names:
                        output_column_names.remove(column_name)

                    elif not ignore_missing_columns:
                        print("Error: cannot remove unknown column '%s'." %
                              column_name,
                              file=error_file,
                              flush=True)
                        trouble_column_names.append(column_name)

            if len(trouble_column_names) > 0:
                raise KGTKException("Unknown columns %s" %
                                    " ".join(trouble_column_names))

            if verbose:
                print("Opening the output file: %s" % str(output_kgtk_file),
                      file=error_file,
                      flush=True)
            kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode[kr.mode.name],
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            open_files.callback(kw.close)

            # Built from the input column names and passed to every write so
            # the writer can select the output columns from each input row.
            shuffle_list: typing.List[int] = kw.build_shuffle_list(kr.column_names)

            input_line_count: int = 0
            row: typing.List[str]
            for row in kr:
                input_line_count += 1
                kw.write(row, shuffle_list=shuffle_list)

            if verbose:
                print("Processed %d rows." % (input_line_count),
                      file=error_file,
                      flush=True)

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1
Example #16
0
def run(input_file: KGTKFiles,
        entity_label_files: KGTKFiles,
        output_file: KGTKFiles,

        label_properties: typing.Optional[typing.List[str]],
        description_properties: typing.Optional[typing.List[str]],
        isa_properties: typing.Optional[typing.List[str]],
        has_properties: typing.Optional[typing.List[str]],
        property_values: typing.Optional[typing.List[str]],
        sentence_label: str,
        explain: bool,
        presorted: bool,
        add_entity_labels_from_input: bool,

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Run a Lexicalize pass over a KGTK input file and write the result.

    The five property lists fall back to the module-level DEFAULT_* values
    when passed as None.  They are handed, together with `sentence_label`
    and `explain`, to a `Lexicalize` object, which optionally preloads the
    entity label files and then processes the input edges in presorted or
    unsorted mode.

    Diagnostics go to stdout when `errors_to_stdout` is true and to stderr
    otherwise (`errors_to_stderr` itself is not consulted here).  The
    remaining keyword arguments are used to build the reader and value
    option structures.

    Returns:
        0 on success.

    Raises:
        KGTKException: if the input lacks a node1, label, or node2 column,
            or wrapping any other exception raised while the files are
            being opened or processed.
    """
    # import modules locally
    import sys
    from pathlib import Path

    from kgtk.exceptions import KGTKException

    from kgtk.gt.lexicalize_utils import Lexicalize

    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    label_paths: typing.List[Path] = KGTKArgumentParser.get_input_file_list(
        entity_label_files,
        who="The entity label file(s)",
        default_stdin=False)
    sink_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Diagnostics go to stderr unless stdout was explicitly requested.
    err: typing.TextIO = sys.stderr
    if errors_to_stdout:
        err = sys.stdout

    def say(text: str) -> None:
        """Emit one diagnostic line, flushed immediately."""
        print(text, file=err, flush=True)

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Any property list the caller left unset takes the module default.
    label_properties = DEFAULT_LABEL_PROPERTIES if label_properties is None else label_properties
    description_properties = DEFAULT_DESCRIPTION_PROPERTIES if description_properties is None else description_properties
    isa_properties = DEFAULT_ISA_PROPERTIES if isa_properties is None else isa_properties
    has_properties = DEFAULT_HAS_PROPERTIES if has_properties is None else has_properties
    property_values = DEFAULT_PROPERTY_VALUES if property_values is None else property_values

    # Echo the effective options for debugging and documentation.
    if show_options:
        say("--input-file=%s" % str(source_path))
        if len(label_paths) > 0:
            say("--entity-label-files %s" % " ".join([str(f) for f in label_paths]))
        say("--output-file=%s" % str(sink_path))

        # List-valued options are shown only when they are non-empty.
        for template, values in (
                ("--label-properties %s", label_properties),
                ("--description-properties %s", description_properties),
                ("--isa-properties %s", isa_properties),
                ("--has-properties %s", has_properties),
                ("--property-values %s", property_values),
        ):
            if len(values) > 0:
                say(template % " ".join(values))

        say("--sentence-label=%s" % str(sentence_label))
        say("--explain=%s" % str(explain))
        say("--presorted=%s" % str(presorted))

        reader_options.show(out=err)
        value_options.show(out=err)
        say("=======")

    lexicalizer: Lexicalize = Lexicalize(label_properties,
                                         description_properties,
                                         isa_properties,
                                         has_properties,
                                         property_values,
                                         sentence_label,
                                         explain=explain,
                                         error_file=err,
                                         verbose=verbose,
                                         very_verbose=very_verbose)
    if len(label_paths) > 0:
        lexicalizer.load_entity_label_files(label_paths,
                                            err,
                                            reader_options,
                                            value_options,
                                            label_properties=label_properties,
                                            verbose=verbose)

    # Both handles start out unset so the `finally` clause can tell which
    # of them were actually opened.
    reader: typing.Optional[KgtkReader] = None
    writer: typing.Optional[KgtkWriter] = None

    try:
        if verbose:
            say("Opening the input file %s" % str(source_path))
        reader = KgtkReader.open(source_path,
                                 options=reader_options,
                                 value_options=value_options,
                                 error_file=err,
                                 verbose=verbose,
                                 very_verbose=very_verbose,
                                 )

        # All three edge columns are required.
        if reader.node1_column_idx < 0:
            raise KGTKException("Missing column: node1 or alias")
        if reader.label_column_idx < 0:
            raise KGTKException("Missing column: label or alias")
        if reader.node2_column_idx < 0:
            raise KGTKException("Missing column: node2 or alias")

        if verbose:
            say("node1 column index = {}".format(reader.node1_column_idx))
            say("label column index = {}".format(reader.label_column_idx))
            say("node2 column index = {}".format(reader.node2_column_idx))

        columns_out: typing.List[str] = OUTPUT_COLUMNS.copy()
        if explain:
            # The column name is spelled this way in the output header.
            columns_out.append("explaination")
            if verbose:
                say("Including an explaination column in the output.")

        if verbose:
            say("Opening the output file %s" % str(sink_path))
        writer = KgtkWriter.open(columns_out,
                                 sink_path,
                                 require_all_columns=True,
                                 prohibit_extra_columns=True,
                                 fill_missing_columns=False,
                                 gzip_in_parallel=False,
                                 verbose=verbose,
                                 very_verbose=very_verbose,
                                 )

        if not presorted:
            lexicalizer.process_unsorted_input(reader, writer, add_entity_labels=add_entity_labels_from_input)
        else:
            lexicalizer.process_presorted_input(reader, writer)

        return 0

    except Exception as e:
        raise KGTKException(str(e))

    finally:
        # Close the writer first, then the reader, whichever were opened.
        if writer is not None:
            writer.close()

        if reader is not None:
            reader.close()
Example #17
0
    def process(self):
        """Label the components of the input graph and write them as edges.

        The input KGTK file is loaded into a graph (directed unless
        `self.undirected` is set).  When `self.properties` is non-empty it
        is split on commas and the graph's edges are replaced by the edges
        `find_edge` returns for each of those values.  Components are then
        computed with `label_components`, grouped by component id, filtered
        by `self.minimum_cluster_size`, renamed through
        `self.name_clusters`, and written in sorted order as
        `(name, 'connected_component', cluster_id)` rows.
        """
        reader: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            who="input",
            options=self.input_reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        key_columns: typing.List[int] = self.get_key_columns(reader, "input")
        # The second key column is the label column; its name selects the
        # edge property map used for filtering below.
        label_column_name = reader.column_names[key_columns[1]]

        graph = load_graph_from_kgtk(reader, directed=not self.undirected)

        if self.properties:
            # Rebuild the edge set from the edges matching each requested
            # property value, with duplicates removed.
            kept_edges = []
            for wanted in self.properties.split(','):
                kept_edges.extend(
                    find_edge(graph, graph.edge_properties[label_column_name], wanted))
            graph.clear_edges()
            graph.add_edge_list(list(set(kept_edges)))
        component_of, _histogram = label_components(graph, directed=self.strong)

        writer: KgtkWriter = KgtkWriter.open(['node1', 'label', 'node2'],
                                             self.output_file_path,
                                             mode=reader.mode,
                                             require_all_columns=False,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=True,
                                             gzip_in_parallel=False,
                                             verbose=self.verbose,
                                             very_verbose=self.very_verbose)

        # Group vertex names by the (stringified) component they belong to.
        members_by_component: typing.MutableMapping[str, typing.List[str]] = {}
        for vertex, component in enumerate(component_of):
            vertex_name = graph.vertex_properties['name'][vertex]
            members_by_component.setdefault(str(component), []).append(vertex_name)

        # Drop components smaller than the configured minimum.
        large_enough: typing.MutableMapping[str, typing.List[str]] = {
            component_id: members
            for component_id, members in members_by_component.items()
            if len(members) >= self.minimum_cluster_size
        }

        named: typing.MutableMapping[
            str, typing.List[str]] = self.name_clusters(large_enough)
        for cluster_name in sorted(named.keys()):
            for member in sorted(named[cluster_name]):
                writer.write([member, 'connected_component', cluster_name])

        writer.close()
Example #18
0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        mapping_file: KGTKFiles,
        unmodified_edges_file: KGTKFiles,
        activated_mapping_file: KGTKFiles,
        rejected_mapping_file: KGTKFiles,

        confidence_column_name: str,
        require_confidence: bool,
        default_confidence_str: typing.Optional[str],
        confidence_threshold: float,

        same_as_item_label: str,
        same_as_property_label: str,
        allow_exact_duplicates: bool,
        allow_idempotent_mapping: bool,

        split_output_mode: bool,
        modified_pattern: str,

        node1_column_name: typing.Optional[str],
        label_column_name: typing.Optional[str],
        node2_column_name: typing.Optional[str],
        mapping_rule_mode: str,
        mapping_node1_column_name: typing.Optional[str],
        mapping_label_column_name: typing.Optional[str],
        mapping_node2_column_name: typing.Optional[str],

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs # Whatever KgtkFileOptions and KgtkValueOptions want.
)->int:
    """Rewrite node1/label/node2 values of an edge file using a mapping file.

    Phase 1 reads the mapping file and builds two lookup tables:

    * an item map (node1 -> node2), from rows whose label equals
      `same_as_item_label` or from every row when `mapping_rule_mode` is
      "same-as-item"; it is applied to the input node1 and node2 columns;
    * a property map, likewise for `same_as_property_label` /
      "same-as-property"; it is applied to the input label column.

    Mapping rows whose confidence is below `confidence_threshold` are
    excluded (and copied to the rejected mapping file when one is given).
    Rows mapping a value to itself are excluded unless
    `allow_idempotent_mapping` is set.  A second rule for an already-mapped
    value is an error, except that an exact repeat is skipped silently when
    `allow_exact_duplicates` is set.

    Phase 2 streams the input file, applies the maps, and decides per row
    whether it counts as modified according to `modified_pattern`.  Modified
    rows go to the output file.  Unmodified rows go to the unmodified-edges
    file when one is given, and also to the output file unless
    `split_output_mode` is set.  Finally the mapping rows that were actually
    used are written, in mapping-file order, to the activated mapping file
    when one is given.

    Diagnostics go to stdout when `errors_to_stdout` is true and to stderr
    otherwise (`errors_to_stderr` itself is not consulted here).

    Returns:
        0 on success.

    Raises:
        KGTKException: for an unparsable `default_confidence_str`, missing
            columns, errors in the mapping file, an empty mapping, an
            unrecognized `modified_pattern`, or wrapping any other
            exception (including SystemExit) raised during processing.
    """
    # import modules locally
    from pathlib import Path
    import sys
    
    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)
    mapping_kgtk_file: Path = KGTKArgumentParser.get_input_file(mapping_file, who="KGTK mappping file")
    unmodified_edges_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(unmodified_edges_file, who="KGTK unmodified edges output file")
    activated_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(activated_mapping_file, who="KGTK activated mapping output file")
    rejected_mapping_kgtk_file: typing.Optional[Path] = KGTKArgumentParser.get_optional_output_file(rejected_mapping_file, who="KGTK rejected mapping output file")

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True)
    mapping_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="mapping", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % repr(str(input_kgtk_file)), file=error_file, flush=True)
        print("--output-file=%s" % repr(str(output_kgtk_file)), file=error_file, flush=True)
        print("--mapping-file=%s" % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        if unmodified_edges_kgtk_file is not None:
            print("--unmodified-edges-file=%s" % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
        if activated_mapping_kgtk_file is not None:
            print("--activated-mapping-edges-file=%s" % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
        if rejected_mapping_kgtk_file is not None:
            print("--rejected-mapping-edges-file=%s" % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)

        print("--confidence-column=%s" % repr(confidence_column_name), file=error_file, flush=True)
        print("--require-confidence=%s" % repr(require_confidence), file=error_file, flush=True)
        if default_confidence_str is not None:
            print("--default-confidence-value=%s" % default_confidence_str, file=error_file, flush=True)
        print("--threshold=%f" % confidence_threshold, file=error_file, flush=True)

        print("--same-as-item-label=%s" % repr(same_as_item_label), file=error_file, flush=True)
        print("--same-as-property-label=%s" % repr(same_as_property_label), file=error_file, flush=True)
        print("--allow-exact-duplicates=%s" % repr(allow_exact_duplicates), file=error_file, flush=True)
        print("--allow-idempotent-actions=%s" % repr(allow_idempotent_mapping), file=error_file, flush=True)

        print("--split-output-mode=%s" % repr(split_output_mode), file=error_file, flush=True)
        print("--modified-pattern=%s" % repr(modified_pattern), file=error_file, flush=True)

        if node1_column_name is not None:
            print("--node1-column-=%s" % repr(node1_column_name), file=error_file, flush=True)
        if label_column_name is not None:
            print("--label-column-=%s" % repr(label_column_name), file=error_file, flush=True)
        if node2_column_name is not None:
            print("--node2-column-=%s" % repr(node2_column_name), file=error_file, flush=True)
        print("--mapping-rule-mode=%s" % repr(mapping_rule_mode), file=error_file, flush=True)
        if mapping_node1_column_name is not None:
            print("--mapping-node1-column-=%s" % repr(mapping_node1_column_name), file=error_file, flush=True)
        if mapping_label_column_name is not None:
            print("--mapping-label-column-=%s" % repr(mapping_label_column_name), file=error_file, flush=True)
        if mapping_node2_column_name is not None:
            print("--mapping-node2-column-=%s" % repr(mapping_node2_column_name), file=error_file, flush=True)

        input_reader_options.show(out=error_file, who="input")
        mapping_reader_options.show(out=error_file, who="mapping")
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    default_confidence_value: typing.Optional[float] = None
    if default_confidence_str is not None:
        try:
            default_confidence_value = float(default_confidence_str)
        except (TypeError, ValueError):
            # Only conversion failures are reported as a bad option value;
            # KeyboardInterrupt and friends are no longer swallowed here.
            raise KGTKException("--default-confidence-value=%s is invalid" % repr(default_confidence_str))

    # Readers/writers that are currently open.  The success path closes each
    # one at a specific point; the `finally` clause at the bottom closes
    # whatever an error path left behind.
    open_handles: typing.List[typing.Any] = []

    def _close(handle: typing.Any) -> None:
        """Close a reader/writer exactly once and stop tracking it."""
        # Untrack by identity *before* closing, so that a failing close()
        # is never retried by the cleanup code.
        open_handles[:] = [h for h in open_handles if h is not handle]
        handle.close()

    try:

        if verbose:
            print("Opening the mapping file %s." % repr(str(mapping_kgtk_file)), file=error_file, flush=True)
        mkr:  KgtkReader = KgtkReader.open(mapping_kgtk_file,
                                           options=mapping_reader_options,
                                           value_options = value_options,
                                           error_file=error_file,
                                           verbose=verbose,
                                           very_verbose=very_verbose,
        )
        open_handles.append(mkr)
        trouble = False
        mapping_node1_idx: int = mkr.get_node1_column_index(mapping_node1_column_name)
        mapping_label_idx: int = mkr.get_label_column_index(mapping_label_column_name)
        mapping_node2_idx: int = mkr.get_node2_column_index(mapping_node2_column_name)
        if mapping_node1_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node1 column.", file=error_file, flush=True)
        # The label column is only consulted in "normal" mode.
        if mapping_label_idx < 0 and mapping_rule_mode == "normal":
            trouble = True
            print("Error: Cannot find the mapping file label column.", file=error_file, flush=True)
        if mapping_node2_idx < 0:
            trouble = True
            print("Error: Cannot find the mapping file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            _close(mkr)
            raise KGTKException("Missing columns in the mapping file.")
        confidence_column_idx: int = mkr.column_name_map.get(confidence_column_name, -1)
        if require_confidence and confidence_column_idx < 0:
            _close(mkr)
            raise KGTKException("The mapping file does not have a confidence column, and confidence is required.")
        
        rmkw: typing.Optional[KgtkWriter] = None
        if rejected_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the rejected mapping edges file %s." % repr(str(rejected_mapping_kgtk_file)), file=error_file, flush=True)
            rmkw = KgtkWriter.open(mkr.column_names,
                                   rejected_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)
            open_handles.append(rmkw)

        # Mapping structures:
        # value -> replacement, and value -> mapping-file line that defined it.
        item_map: typing.MutableMapping[str, str] = dict()
        item_line_map: typing.MutableMapping[str, int] = dict()
        property_map: typing.MutableMapping[str, str] = dict()
        property_line_map: typing.MutableMapping[str, int] = dict()

        # Accepted mapping rows by line number, and the subset actually used.
        mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()
        activated_mapping_rows: typing.MutableMapping[int, typing.List[str]] = dict()

        # Read the mapping file.
        if verbose:
            print("Processing the mapping file.", file=error_file, flush=True)
        mapping_confidence_exclusions: int = 0
        mapping_idempotent_exclusions: int = 0
        mapping_errors: int = 0
        mapping_line_number: int = 0
        mrow: typing.List[str]
        for mrow in mkr:
            mapping_line_number += 1
            mapping_node1: str = mrow[mapping_node1_idx]
            mapping_label: str = mrow[mapping_label_idx] if mapping_rule_mode == "normal" else ""
            mapping_node2: str = mrow[mapping_node2_idx]
            mapping_confidence: typing.Optional[float] = default_confidence_value
            if confidence_column_idx >= 0:
                confidence_value_str: str = mrow[confidence_column_idx]
                if len(confidence_value_str) == 0:
                    # An empty cell keeps the default unless a value is required.
                    if require_confidence:
                        print("In line %d of the mapping file: the required confidence value is missing" % (mapping_line_number),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue
                else:
                    try:
                        mapping_confidence = float(confidence_value_str)
                    except ValueError:
                        print("In line %d of the mapping file: cannot parse confidence value %s" % (mapping_line_number, repr(mrow[confidence_column_idx])),
                              file=error_file, flush=True)
                        mapping_errors += 1
                        continue
            if mapping_confidence is not None and mapping_confidence < confidence_threshold:
                mapping_confidence_exclusions += 1
                if rmkw is not None:
                    rmkw.write(mrow)
                continue

            if mapping_node1 == mapping_node2 and not allow_idempotent_mapping:
                mapping_idempotent_exclusions += 1
                continue
        
            if mapping_rule_mode == "same-as-item" or mapping_label == same_as_item_label:
                if mapping_node1 in item_map:
                    # A repeat is an error unless it is an exact duplicate
                    # and exact duplicates are allowed; either way the first
                    # rule stays in force.
                    if mapping_node2 != item_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      item_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                    continue

                item_map[mapping_node1] = mapping_node2
                item_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            elif mapping_rule_mode == "same-as-property" or mapping_label == same_as_property_label:
                if mapping_node1 in property_map:
                    if mapping_node2 != property_map[mapping_node1] or not allow_exact_duplicates:
                        print("Duplicate %s for %s at mapping file line %d, originally in line %d" % (mapping_label,
                                                                                                      repr(mapping_node1),
                                                                                                      mapping_line_number,
                                                                                                      property_line_map[mapping_node1]),
                              file=error_file, flush=True)
                        mapping_errors += 1
                    continue
                property_map[mapping_node1] = mapping_node2
                property_line_map[mapping_node1] = mapping_line_number
                mapping_rows[mapping_line_number] = mrow.copy()

            else:
                print("Unknown mapping action %s at line %d of mapping file %s" % (mapping_label,
                                                                                   mapping_line_number,
                                                                                   repr(str(mapping_kgtk_file))),
                      file=error_file, flush=True)
                mapping_errors += 1
                continue
                

        # Close the mapping file.
        _close(mkr)
        if rmkw is not None:
            _close(rmkw)

        if mapping_errors > 0:
            raise KGTKException("%d errors detected in the mapping file %s" % (mapping_errors, repr(str(mapping_kgtk_file))))

        if len(item_map) == 0 and len(property_map) == 0:
            raise KGTKException("Nothing read from the mapping file %s" % repr(str(mapping_kgtk_file)))

        if verbose:
            print("%d mapping lines, %d excluded for confidence, %d excluded for idempotency." % (mapping_line_number,
                                                                                                  mapping_confidence_exclusions,
                                                                                                  mapping_idempotent_exclusions),
                  file=error_file, flush=True)
            print("%d item mapping rules." % len(item_map), file=error_file, flush=True)
            print("%d property mapping rules." % len(property_map), file=error_file, flush=True)

        if verbose:
            print("Opening the input file %s." % repr(str(input_kgtk_file)), file=error_file, flush=True)
        ikr:  KgtkReader = KgtkReader.open(input_kgtk_file,
                                           options=input_reader_options,
                                           value_options = value_options,
                                           error_file=error_file,
                                           verbose=verbose,
                                           very_verbose=very_verbose,
        )
        open_handles.append(ikr)
        trouble = False
        input_node1_idx: int = ikr.get_node1_column_index(node1_column_name)
        input_label_idx: int = ikr.get_label_column_index(label_column_name)
        input_node2_idx: int = ikr.get_node2_column_index(node2_column_name)
        # Each column is required only in the modes that rewrite it.
        if input_node1_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node1 column.", file=error_file, flush=True)
        if input_label_idx < 0 and mapping_rule_mode in ["normal", "same-as-property"]:
            trouble = True
            print("Error: Cannot find the input file label column.", file=error_file, flush=True)
        if input_node2_idx < 0 and mapping_rule_mode in ["normal", "same-as-item"]:
            trouble = True
            print("Error: Cannot find the input file node2 column.", file=error_file, flush=True)
        if trouble:
            # Clean up:
            _close(ikr)
            raise KGTKException("Missing columns in the input file.")

        okw: KgtkWriter = KgtkWriter.open(ikr.column_names,
                                          output_kgtk_file,
                                          mode=KgtkWriter.Mode[ikr.mode.name],
                                          use_mgzip=input_reader_options.use_mgzip, # Hack!
                                          mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                          error_file=error_file,
                                          verbose=verbose,
                                          very_verbose=very_verbose)
        open_handles.append(okw)

        uekw: typing.Optional[KgtkWriter] = None
        if unmodified_edges_kgtk_file is not None:
            if verbose:
                print("Opening the unmodified edges file %s." % repr(str(unmodified_edges_kgtk_file)), file=error_file, flush=True)
            uekw = KgtkWriter.open(ikr.column_names,
                                   unmodified_edges_kgtk_file,
                                   mode=KgtkWriter.Mode[ikr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)
            open_handles.append(uekw)

        amkw: typing.Optional[KgtkWriter] = None
        if activated_mapping_kgtk_file is not None:
            if verbose:
                print("Opening the activated mapping edges file %s." % repr(str(activated_mapping_kgtk_file)), file=error_file, flush=True)
            # The mapping reader is closed by now; only its column names and
            # mode are reused here.
            amkw = KgtkWriter.open(mkr.column_names,
                                   activated_mapping_kgtk_file,
                                   mode=KgtkWriter.Mode[mkr.mode.name],
                                   use_mgzip=input_reader_options.use_mgzip, # Hack!
                                   mgzip_threads=input_reader_options.mgzip_threads, # Hack!
                                   error_file=error_file,
                                   verbose=verbose,
                                   very_verbose=very_verbose)
            open_handles.append(amkw)

        # Process each row of the input file.
        if verbose:
            print("Processing the input file.", file=error_file, flush=True)
        input_count: int = 0
        modified_edge_count: int = 0
        unmodified_edge_count: int = 0
        row: typing.List[str]
        for row in ikr:
            input_count +=1
            newrow: typing.List[str] = row.copy()

            modified_node1: bool = False
            modified_node2: bool = False
            modified_label: bool = False

            if mapping_rule_mode in ["normal", "same-as-item"]:
                input_node1: str = row[input_node1_idx]
                if input_node1 in item_map:
                    newrow[input_node1_idx] = item_map[input_node1]
                    modified_node1 = True
                    # Activated rules are only tracked when they will be written.
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node1]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]
                        
                input_node2: str = row[input_node2_idx]
                if input_node2 in item_map:
                    newrow[input_node2_idx] = item_map[input_node2]
                    modified_node2 = True
                    if amkw is not None:
                        mapping_line_number = item_line_map[input_node2]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            if mapping_rule_mode in ["normal", "same-as-property"]:
                input_label: str = row[input_label_idx]
                if input_label in property_map:
                    newrow[input_label_idx] = property_map[input_label]
                    modified_label = True
                    if amkw is not None:
                        mapping_line_number = property_line_map[input_label]
                        if mapping_line_number not in activated_mapping_rows:
                            activated_mapping_rows[mapping_line_number] = mapping_rows[mapping_line_number]

            # "|" patterns need any listed column to have changed,
            # "&" patterns need all of them.
            modified: bool
            if modified_pattern == "node1|label|node2":
                modified = modified_node1 or modified_label or modified_node2
            elif modified_pattern == "node1|label":
                modified = modified_node1 or modified_label
            elif modified_pattern == "node1|node2":
                modified = modified_node1 or modified_node2
            elif modified_pattern == "label|node2":
                modified = modified_label or modified_node2
            elif modified_pattern == "node1":
                modified = modified_node1
            elif modified_pattern == "label":
                modified = modified_label
            elif modified_pattern == "node2":
                modified = modified_node2
            elif modified_pattern == "node1&label&node2":
                modified = modified_node1 and modified_label and modified_node2
            elif modified_pattern == "node1&label":
                modified = modified_node1 and modified_label
            elif modified_pattern == "node1&node2":
                modified = modified_node1 and modified_node2
            elif modified_pattern == "label&node2":
                modified = modified_label and modified_node2
            else:
                raise KGTKException("Unrecognized modification test pattern %s" % repr(modified_pattern))

            if modified:
                modified_edge_count += 1
                okw.write(newrow)
            else:
                unmodified_edge_count += 1
                if uekw is not None:
                    uekw.write(row)
                if not split_output_mode:
                    okw.write(row)
                        
        # Done!
        _close(ikr)
        _close(okw)

        if verbose:
            print("%d edges read. %d modified, %d unmodified." % (input_count, modified_edge_count, unmodified_edge_count), file=error_file, flush=True)

        if uekw is not None:
            _close(uekw)

        if amkw is not None:
            # Emit the activated rules in mapping-file order.
            activated_count: int = 0
            for mapping_line_number in sorted(activated_mapping_rows.keys()):
                amkw.write(activated_mapping_rows[mapping_line_number])
                activated_count += 1
            _close(amkw)

            if verbose:
                print("%d activated mapping edges" % activated_count, file=error_file, flush=True)

        return 0

    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
    finally:
        # Nothing is left here on the success path.  After an error, close
        # whatever is still open, newest first, without letting a secondary
        # close failure replace the exception being reported.
        for leftover in reversed(open_handles):
            try:
                leftover.close()
            except Exception as close_error:
                print("Warning: error while closing a file during cleanup: %s" % str(close_error),
                      file=error_file, flush=True)
Example #19
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        output_format: typing.Optional[str],
        column_names: typing.List[str],
        omit_remaining_columns: bool,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file, writing its columns in a caller-specified order.

    Args:
        input_file: Input file specification, resolved through
            KGTKArgumentParser.get_input_file().
        output_file: Output file specification, resolved through
            KGTKArgumentParser.get_output_file().
        output_format: Passed to KgtkWriter.open() as output_format.
        column_names: The requested column order.  Besides names of input
            columns, two operators are recognized:
            "..." (ellipses) stands for every column not otherwise mentioned
            and may appear only once; ".." (range) stands for the input
            columns lying between the column named before it and the column
            named after it, in either direction.
        omit_remaining_columns: When true, input columns that are not
            mentioned (and not covered by an ellipses) are dropped instead of
            being reported as an error.
        errors_to_stdout: Send diagnostics to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; this function
            selects the diagnostic stream from errors_to_stdout only.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Passed to the reader and writer.
        **kwargs: Consumed by KgtkReaderOptions.from_dict() and
            KgtkValueOptions.from_dict().

    Returns:
        0 on success.

    Raises:
        KGTKException: On any failure, including malformed column lists.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file),
              file=error_file,
              flush=True)
        print("--output-file=%s" % str(output_kgtk_file),
              file=error_file,
              flush=True)
        if output_format is not None:
            print("--output-format=%s" % output_format,
                  file=error_file,
                  flush=True)
        print("--columns %s" % " ".join(column_names),
              file=error_file,
              flush=True)
        print("--trim=%s" % str(omit_remaining_columns),
              file=error_file,
              flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        if verbose:
            print("Opening the input file %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        kr = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # remaining_names: input columns not yet placed in the output order.
        # reordered_names: the output order being built.
        # save_reordered_names: the names seen before the ellipses, once the
        # ellipses has been seen (None until then).
        remaining_names: typing.List[str] = kr.column_names.copy()
        reordered_names: typing.List[str] = []
        save_reordered_names: typing.Optional[typing.List[str]] = None

        ellipses: str = "..."  # All unmentioned columns
        ranger: str = ".."  # All columns between two columns.

        saw_ranger: bool = False
        column_name: str
        for column_name in column_names:
            if column_name == ellipses:
                if save_reordered_names is not None:
                    raise KGTKException("Elipses may appear only once")

                if saw_ranger:
                    raise KGTKException(
                        "ELipses may not appear directly after a range operator ('..')."
                    )

                # Names seen so far go before the unmentioned columns; names
                # seen from here on go after them.
                save_reordered_names = reordered_names
                reordered_names = []
                continue

            if column_name == ranger:
                if len(reordered_names) == 0:
                    raise KGTKException(
                        "The column range operator ('..') may not appear without a preceeding column name."
                    )
                saw_ranger = True
                continue

            if column_name not in kr.column_names:
                raise KGTKException("Unknown column name '%s'." % column_name)
            if column_name not in remaining_names:
                raise KGTKException(
                    "Column name '%s' was duplicated in the list." %
                    column_name)

            if saw_ranger:
                saw_ranger = False
                prior_column_name: str = reordered_names[-1]
                prior_column_idx: int = kr.column_name_map[prior_column_name]
                column_name_idx: int = kr.column_name_map[column_name]

                # Step from the prior column toward the new column, visiting
                # only the columns strictly between them.  range() handles
                # both directions; a "<=" loop bound would be wrong when
                # stepping downward.
                idx_inc: int = 1 if column_name_idx > prior_column_idx else -1
                idx: int
                for idx in range(prior_column_idx + idx_inc, column_name_idx,
                                 idx_inc):
                    idx_column_name: str = kr.column_names[idx]
                    if idx_column_name not in remaining_names:
                        # Report the column inside the range that was
                        # already used, not the end of the range.
                        raise KGTKException(
                            "Column name '%s' (%s .. %s) was duplicated in the list."
                            % (idx_column_name, prior_column_name,
                               column_name))

                    reordered_names.append(idx_column_name)
                    remaining_names.remove(idx_column_name)

            reordered_names.append(column_name)
            remaining_names.remove(column_name)

        if saw_ranger:
            raise KGTKException(
                "The column ranger operator ('..') may not end the list of column names."
            )

        if len(remaining_names) > 0 and save_reordered_names is None:
            # There are remaining column names and the ellipses was not seen.
            if not omit_remaining_columns:
                raise KGTKException(
                    "No ellipses, and the following columns not accounted for: %s"
                    % " ".join(remaining_names))
            else:
                if verbose:
                    print("Omitting the following columns: %s" %
                          " ".join(remaining_names),
                          file=error_file,
                          flush=True)
        if save_reordered_names is not None:
            # Splice: names before the ellipses, then the unmentioned
            # columns, then names after the ellipses.
            if len(remaining_names) > 0:
                save_reordered_names.extend(remaining_names)
            if len(reordered_names) > 0:
                save_reordered_names.extend(reordered_names)
            reordered_names = save_reordered_names

        if verbose:
            print("Opening the output file %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(
            reordered_names,
            output_kgtk_file,
            require_all_columns=True,
            prohibit_extra_columns=True,
            fill_missing_columns=False,
            gzip_in_parallel=False,
            mode=KgtkWriter.Mode[kr.mode.name],
            output_format=output_format,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # The shuffle list lets the writer reorder each input row.
        shuffle_list: typing.List = kw.build_shuffle_list(kr.column_names)

        input_data_lines: int = 0
        row: typing.List[str]
        for row in kr:
            input_data_lines += 1
            kw.write(row, shuffle_list=shuffle_list)

        # Flush the output file so far:
        kw.flush()

        if verbose:
            print("Read %d data lines from file %s" %
                  (input_data_lines, input_kgtk_file),
                  file=error_file,
                  flush=True)

        kw.close()
        # Release the input as well as the output.
        kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        raise KGTKException(str(e)) from e
Example #20
0
def main():
    """
    Command-line test driver for the KGTK ID builder.

    Accepts an optional positional input path and an -o/--output-file path
    (both default to "-"), together with the arguments registered by the ID
    builder, reader and value option classes.  It opens a reader, builds the
    ID builder from it, opens a writer with the builder's column names, and
    hands both to the builder's process() method.
    """
    cli: ArgumentParser = ArgumentParser()
    cli.add_argument(
        dest="input_file_path",
        type=Path,
        nargs="?",
        default="-",
        help="The KGTK file with the input data (default=%(default)s)")
    cli.add_argument(
        "-o",
        "--output-file",
        dest="output_file_path",
        type=Path,
        default="-",
        help="The KGTK file to write (default=%(default)s).")

    # Let each option class register its own arguments.
    KgtkIdBuilderOptions.add_arguments(cli)
    KgtkReader.add_debug_arguments(cli)
    KgtkReaderOptions.add_arguments(cli, mode_options=True)
    KgtkValueOptions.add_arguments(cli)

    parsed: Namespace = cli.parse_args()

    # Diagnostics go to stderr unless stdout was requested.
    diagnostics: typing.TextIO
    if parsed.errors_to_stdout:
        diagnostics = sys.stdout
    else:
        diagnostics = sys.stderr

    # Turn the parsed arguments into option structures.
    builder_opts: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_args(parsed)
    reader_opts: KgtkReaderOptions = KgtkReaderOptions.from_args(parsed)
    value_opts: KgtkValueOptions = KgtkValueOptions.from_args(parsed)

    # Echo the effective options when asked to.
    if parsed.show_options:
        for text in ("input: %s" % str(parsed.input_file_path),
                     "--output-file=%s" % str(parsed.output_file_path)):
            print(text, file=diagnostics, flush=True)
        for opts in (builder_opts, reader_opts, value_opts):
            opts.show(out=diagnostics)

    # Order matters: the reader supplies what the ID builder needs, and the
    # ID builder supplies the (possibly revised) column names for the writer.
    reader: KgtkReader = KgtkReader.open(
        parsed.input_file_path,
        error_file=diagnostics,
        options=reader_opts,
        value_options=value_opts,
        verbose=parsed.verbose,
        very_verbose=parsed.very_verbose,
    )

    builder: KgtkIdBuilder = KgtkIdBuilder.new(reader, builder_opts)

    writer: KgtkWriter = KgtkWriter.open(
        builder.column_names,
        parsed.output_file_path,
        mode=reader.mode,
        require_all_columns=True,
        prohibit_extra_columns=True,
        fill_missing_columns=False,
        gzip_in_parallel=False,
        verbose=parsed.verbose,
        very_verbose=parsed.very_verbose,
    )

    # Stream the input through the ID builder into the output.
    builder.process(reader, writer)

    writer.close()
    reader.close()
Example #21
0
    def process(self):
        """Count the unique values of one input column and write the counts.

        Reads the file at self.input_file_path, tallies the values found in
        the column named self.column_name, and writes the tallies to
        self.output_file_path.  Empty values are replaced by self.empty_value
        (and dropped if that is empty too); self.prefix is prepended to every
        counted value.  When self.where_column_name is set, only rows whose
        value in that column is one of self.where_values are counted.

        Output layout depends on self.output_format:
          "edge": one (value, self.label_value, count) row per unique value.
          "node": a single row whose id is self.column_name, with one column
                  per unique value holding that value's count.

        Raises:
            ValueError: if a required column is missing, a where column is
                given without values, the output format is unknown, or (node
                format) a value collides with a node1 column name.
        """
        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        if self.column_name not in kr.column_name_map:
            raise ValueError("Column %s is not in the input file" %
                             (self.column_name))
        column_idx: int = kr.column_name_map[self.column_name]

        # A where_column_idx of -1 means "no where filter".
        where_column_idx: int = -1
        where_value_set: typing.Set[str] = set()
        if self.where_column_name is not None:
            if self.where_column_name not in kr.column_name_map:
                raise ValueError(
                    "Where column '%s' is not in the input file." %
                    (self.where_column_name))
            where_column_idx = kr.column_name_map[self.where_column_name]
            if self.where_values is None or len(self.where_values) == 0:
                raise ValueError("Where column '%s' but no values to test." %
                                 (self.where_column_name))
            else:
                where_value_set = set(self.where_values)

        if self.verbose:
            print("Counting unique values from the %s column in %s" %
                  (self.column_name, self.input_file_path),
                  file=self.error_file,
                  flush=True)
        input_line_count: int = 0
        skip_line_count: int = 0
        # Rows that passed the where filter but had an empty value in the
        # counted column (before self.empty_value was substituted).
        empty_value_count: int = 0

        value_counts: typing.MutableMapping[str, int] = {}

        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            if where_column_idx >= 0:
                if row[where_column_idx] not in where_value_set:
                    skip_line_count += 1
                    continue
            value: str = row[column_idx]
            if len(value) == 0:
                empty_value_count += 1
                value = self.empty_value
            if len(value) > 0:
                value = self.prefix + value
                value_counts[value] = value_counts.get(value, 0) + 1

        # All input has been consumed; release the reader.
        kr.close()

        if self.verbose:
            print(
                "Read %d records, skipped %d, found %d unique non-empty values, %d empty values."
                % (input_line_count, skip_line_count, len(value_counts),
                   empty_value_count),
                file=self.error_file,
                flush=True)

        # In node mode we can't open the output file until we are done
        # reading the input file, because we need the list of unique values
        # to build the column list.
        output_columns: typing.List[str]
        if self.output_format == "edge":
            output_columns = ["node1", "label", "node2"]
        elif self.output_format == "node":
            output_columns = ["id"]
            for value in sorted(value_counts.keys()):
                # TODO: provide a way to override this check.
                if value in KgtkFormat.NODE1_COLUMN_NAMES:
                    raise ValueError(
                        "Cannot write a KGTK node file with a column named '%s'."
                        % value)
                output_columns.append(value)
        else:
            raise ValueError("Unknown output format %s" %
                             str(self.output_format))

        if self.verbose:
            print("Opening the output file: %s" % self.output_file_path,
                  file=self.error_file,
                  flush=True)

        ew: KgtkWriter = KgtkWriter.open(output_columns,
                                         self.output_file_path,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

        if self.output_format == "edge":
            for value in sorted(value_counts.keys()):
                ew.write([value, self.label_value, str(value_counts[value])])
        else:
            # Node format: any other format was rejected before the output
            # file was opened.  The counts are emitted in the same sorted
            # order that was used to build the column list.
            counts_row: typing.List[str] = [self.column_name]
            for value in sorted(value_counts.keys()):
                counts_row.append(str(value_counts[value]))
            ew.write(counts_row)

        ew.close()
Example #22
0
    def process(self):
        """Convert the input files into a KGTK edge file.

        Opens the output writer (with the ID builder's column names when
        self.build_id is set and ID builder options are present, otherwise
        self.COLUMN_NAMES) and, optionally, a plain-text reject file.  Each
        line of every path in self.input_file_paths is passed to
        self.parse(); the three resulting fields are passed through
        self.convert_and_validate() and, if all three are accepted, written
        with self.write_row().  Lines that fail parsing or validation are
        copied verbatim to the reject file (when one is open) and counted.

        A path of "-" means stdin for input and stdout for the reject file;
        those standard streams are never closed here.
        """
        output_column_names: typing.List[str]
        if self.build_id and self.idbuilder_options is not None:
            self.idbuilder = KgtkIdBuilder.from_column_names(
                self.COLUMN_NAMES, self.idbuilder_options)
            output_column_names = self.idbuilder.column_names
        else:
            output_column_names = self.COLUMN_NAMES

        if self.verbose:
            print("Opening output file %s" % str(self.output_file_path),
                  file=self.error_file,
                  flush=True)
        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                         self.output_file_path,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

        rw: typing.Optional[typing.TextIO] = None
        if self.reject_file_path is not None:
            if self.verbose:
                print("Opening reject file %s" % str(self.reject_file_path),
                      file=self.error_file,
                      flush=True)
            # Open the reject file. Since the input data is not in KGTK format,
            # we use an ordinary file here.
            if str(self.reject_file_path) == "-":
                rw = sys.stdout
            else:
                rw = open(self.reject_file_path, "wt")

        total_input_line_count: int = 0
        reject_line_count: int = 0

        namespace_line_count: int = self.get_initial_namespaces()

        input_file_path: str
        for input_file_path in self.input_file_paths:
            # Line numbers restart for each input file.
            input_line_count: int = 0

            if self.local_namespace_use_uuid or self.namespace_id_use_uuid or self.newnode_use_uuid:
                if self.override_uuid is not None:
                    self.local_namespace_uuid = self.override_uuid  # for debugging
                else:
                    # Generate a new local namespace UUID.
                    self.local_namespace_uuid = shortuuid.uuid()

            # Open the input file.
            if self.verbose:
                print("Opening the input file: %s" % input_file_path,
                      file=self.error_file,
                      flush=True)
            # Use the same str() comparison everywhere so that a Path("-")
            # is recognized as stdin both when opening and when closing.
            reading_stdin: bool = str(input_file_path) == "-"
            infile: typing.TextIO
            if reading_stdin:
                infile = sys.stdin
            else:
                infile = open(input_file_path, 'rt')

            try:
                line: str
                for line in infile:
                    input_line_count += 1
                    total_input_line_count += 1

                    row: typing.List[str]
                    valid: bool
                    row, valid = self.parse(line, input_line_count)
                    if not valid:
                        if rw is not None:
                            rw.write(line)
                        reject_line_count += 1
                        continue

                    node1: str
                    ok_1: bool
                    node1, ok_1 = self.convert_and_validate(
                        row[0], input_line_count, ew)

                    label: str
                    ok_2: bool
                    label, ok_2 = self.convert_and_validate(
                        row[1], input_line_count, ew)

                    node2: str
                    ok_3: bool
                    node2, ok_3 = self.convert_and_validate(
                        row[2], input_line_count, ew)

                    if ok_1 and ok_2 and ok_3:
                        self.write_row(ew, node1, label, node2)
                    else:
                        if rw is not None:
                            rw.write(line)
                        reject_line_count += 1
            finally:
                # Close files we opened, even if a line raised; leave stdin
                # alone.
                if not reading_stdin:
                    infile.close()

            if not reading_stdin:
                # NOTE(review): namespaces are saved only for inputs other
                # than stdin, which preserves the existing behavior --
                # confirm whether stdin input should save them as well.
                self.save_namespaces(ew)

        if self.verbose:
            print("Processed %d known namespaces." % (namespace_line_count),
                  file=self.error_file,
                  flush=True)
            print("Processed %d records." % (total_input_line_count),
                  file=self.error_file,
                  flush=True)
            print("Rejected %d records." % (reject_line_count),
                  file=self.error_file,
                  flush=True)
            print("Wrote %d records." % (self.output_line_count),
                  file=self.error_file,
                  flush=True)

        ew.close()

        # Close the reject file only if it is a file we opened; an identity
        # test avoids closing sys.stdout when the reject path is a Path("-").
        if rw is not None and rw is not sys.stdout:
            rw.close()
Example #23
0
    def process(self):
        """Join the left and right input files into a single output file.

        Opens both inputs, checks with self.ok_to_join() that they can be
        joined, builds the join-column index lists, and obtains the set of
        join keys to keep from self.join_key_sets().  The merged column list
        is produced by KgtkMergeColumns.  All kept left rows are written
        first, followed by all kept right rows (shuffled into the merged
        column layout).  When the joined key set is None every row of both
        inputs is kept.

        Returns:
            1 when the inputs cannot be joined or the left and right join
            keys have different numbers of components.  On success the
            method falls off the end and returns None.
        """
        if self.verbose:
            print("Opening the left edge file: %s" % str(self.left_file_path), file=self.error_file, flush=True)
        left_kr: KgtkReader = KgtkReader.open(self.left_file_path,
                                              who="left input",
                                              options=self.left_reader_options,
                                              value_options = self.value_options,
                                              error_file=self.error_file,
                                              verbose=self.verbose,
                                              very_verbose=self.very_verbose
        )


        if self.verbose:
            print("Opening the right edge file: %s" % str(self.right_file_path), file=self.error_file, flush=True)
        right_kr: KgtkReader = KgtkReader.open(self.right_file_path,
                                               who="right input",
                                               options=self.right_reader_options,
                                               value_options = self.value_options,
                                               error_file=self.error_file,
                                               verbose=self.verbose,
                                               very_verbose=self.very_verbose
        )

        if not self.ok_to_join(left_kr, right_kr):
            left_kr.close()
            right_kr.close()
            return 1

        left_join_idx_list: typing.List[int] = self.build_join_idx_list(left_kr, self.LEFT, self.left_join_columns)
        right_join_idx_list: typing.List[int] = self.build_join_idx_list(right_kr, self.RIGHT, self.right_join_columns)
        if len(left_join_idx_list) != len(right_join_idx_list):
            print("the left join key has %d components, the right join key has %d columns. Exiting." % (len(left_join_idx_list), len(right_join_idx_list)), file=self.error_file, flush=True)
            left_kr.close()
            right_kr.close()
            return 1

        # This might open the input files for a second time. This won't work with stdin.
        joined_key_set: typing.Optional[typing.Set[str]] = self.join_key_sets(left_join_idx_list, right_join_idx_list)

        if self.verbose:
            print("Mapping the column names for the join.", file=self.error_file, flush=True)
        kmc: KgtkMergeColumns = KgtkMergeColumns()
        kmc.merge(left_kr.column_names, prefix=self.left_prefix)
        right_column_names: typing.List[str] = kmc.merge(right_kr.column_names, prefix=self.right_prefix)
        joined_column_names: typing.List[str] = kmc.column_names

        if self.verbose:
            print("       left   columns: %s" % " ".join(left_kr.column_names), file=self.error_file, flush=True)
            print("       right  columns: %s" % " ".join(right_kr.column_names), file=self.error_file, flush=True)
            print("mapped right  columns: %s" % " ".join(right_column_names), file=self.error_file, flush=True)
            print("       joined columns: %s" % " ".join(joined_column_names), file=self.error_file, flush=True)

        if self.verbose:
            print("Opening the output edge file: %s" % str(self.output_path), file=self.error_file, flush=True)
        ew: KgtkWriter = KgtkWriter.open(joined_column_names,
                                         self.output_path,
                                         mode=left_kr.mode,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

        output_data_lines: int = 0
        left_data_lines_read: int = 0
        left_data_lines_kept: int = 0
        right_data_lines_read: int = 0
        right_data_lines_kept: int = 0

        if self.verbose:
            print("Processing the left input file: %s" % str(self.left_file_path), file=self.error_file, flush=True)
        row: typing.List[str]
        for row in left_kr:
            left_data_lines_read += 1
            # With no key set every row is kept; the key is only built when
            # there is a set to test it against.
            if joined_key_set is None or self.build_join_key(left_kr, left_join_idx_list, row) in joined_key_set:
                ew.write(row)
                output_data_lines += 1
                left_data_lines_kept += 1
        # The left input is fully consumed; close it as the early-exit
        # paths above do.
        left_kr.close()
        # Flush the output file so far:
        ew.flush()

        if self.verbose:
            print("Processing the right input file: %s" % str(self.right_file_path), file=self.error_file, flush=True)
        # Right rows are written through a shuffle list built from the
        # mapped right column names.
        right_shuffle_list: typing.List[int] = ew.build_shuffle_list(right_column_names)
        for row in right_kr:
            right_data_lines_read += 1
            if joined_key_set is None or self.build_join_key(right_kr, right_join_idx_list, row) in joined_key_set:
                ew.write(row, shuffle_list=right_shuffle_list)
                output_data_lines += 1
                right_data_lines_kept += 1
        right_kr.close()

        ew.close()
        if self.verbose:
            print("The join is complete", file=self.error_file, flush=True)
            print("%d left input data lines read, %d kept" % (left_data_lines_read, left_data_lines_kept), file=self.error_file, flush=True)
            print("%d right input data lines read, %d kept" % (right_data_lines_read, right_data_lines_kept), file=self.error_file, flush=True)
            print("%d data lines written." % output_data_lines, file=self.error_file, flush=True)
Example #24
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Copy a KGTK file through a KgtkIdBuilder.

    The input file is opened with KgtkReader, a KgtkIdBuilder is created
    from the reader and the ID builder options, the output file is opened
    with the column names supplied by the ID builder, and the ID builder
    then processes the reader into the writer.

    Args:
        input_file: Input file specification, resolved with
            KGTKArgumentParser.get_input_file().
        output_file: Output file specification, resolved with
            KGTKArgumentParser.get_output_file().
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not consulted by this function; stderr is used
            whenever errors_to_stdout is false.
        show_options: Print the resolved file names and option structures
            to the error file before processing.
        verbose: Passed to the reader and the writer.
        very_verbose: Passed to the reader and the writer.
        **kwargs: Source for the ID builder, reader and value option
            structures (each built with its from_dict()).

    Returns:
        0 on success.

    Raises:
        KGTKException: If any Exception or SystemExit is raised while
            opening, processing or closing the files.  The original
            exception is attached as the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(
        kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:

        # First create the KgtkReader.  It provides parameters used by the ID
        # column builder. Next, create the ID column builder, which provides a
        # possibly revised list of column names for the KgtkWriter.  Create
        # the KgtkWriter.  Last, process the data stream.

        # Open the input file.
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        try:
            # Create the ID builder.
            idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

            # Open the output file.
            ew: KgtkWriter = KgtkWriter.open(idb.column_names,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode[kr.mode.name],
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             gzip_in_parallel=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            try:
                # Process the input file, building IDs.
                idb.process(kr, ew)
            finally:
                # Close the writer first, then the reader, even when
                # processing failed part way through.
                ew.close()
        finally:
            kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested") from e
    except Exception as e:
        # The cleanup above runs inside this try block, so errors raised
        # while closing the files are wrapped as well.
        raise KGTKException(str(e)) from e
Example #25
0
    def process(self):
        """Read the input file, group its rows by key and write the results.

        The steps are:

        1. Open the input file (or stdin) with KgtkReader.
        2. Optionally create a KgtkIdBuilder, which then supplies the
           output column names.
        3. Build the list of key column indexes, either from the required
           columns of an edge or node file or from self.key_column_names.
        4. Resolve the keep-first columns, and when deduplicating add every
           remaining column to the key.
        5. Open the output file and the optional list output file.
        6. Feed every row to self.process_row(), either in input order
           (self.sorted_input, optionally verifying that the keys are
           monotonic) or after grouping the rows by key in memory and
           visiting the keys in sorted order.
        7. Make a final flushing call to self.process_row(), report the
           counts and close the writers.

        The reader is not closed by this method.

        Raises:
            ValueError: If ID building is requested without ID builder
                options, if no key columns are given for a file that is
                neither an edge nor a node file, if a key or keep-first
                column name is unknown, if a keep-first column is also a
                key column, or if sort verification fails.
        """

        # Open the input file.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            else:
                print("Reading the input data from stdin",
                      file=self.error_file,
                      flush=True)

        kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            options=self.reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        self.id_column_idx = kr.id_column_idx

        # If requested, create the ID column builder.
        # Assemble the list of output column names.
        output_column_names: typing.List[str]
        idb: typing.Optional[KgtkIdBuilder] = None
        if self.build_id:
            if self.idbuilder_options is None:
                raise ValueError(
                    "ID build requested but ID builder options are missing")
            idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
            output_column_names = idb.column_names
        else:
            output_column_names = kr.column_names

        # Build the list of key column edges:
        key_idx_list: typing.List[int] = []

        if len(self.key_column_names) == 0:
            if kr.is_edge_file:
                # Add the KGTK edge file required columns.
                key_idx_list.append(kr.node1_column_idx)
                key_idx_list.append(kr.label_column_idx)
                key_idx_list.append(kr.node2_column_idx)
                # The ID column is part of the key only when IDs are not
                # being compacted and the file actually has an ID column.
                if not self.compact_id and kr.id_column_idx >= 0:
                    key_idx_list.append(kr.id_column_idx)

            elif kr.is_node_file:
                # Add the KGTK node file required column:
                key_idx_list.append(kr.id_column_idx)

            else:
                raise ValueError(
                    "The input file is neither an edge nor a node file.  Key columns must be supplied."
                )

        else:
            # Append columns to the list of key column indices,
            # silently removing duplicates, but complaining about unknown names.
            #
            # TODO: warn about duplicates?
            column_name: str
            for column_name in self.key_column_names:
                if column_name not in kr.column_name_map:
                    raise ValueError("Column %s is not in the input file" %
                                     (repr(column_name)))
                key_idx: int = kr.column_name_map[column_name]
                if key_idx not in key_idx_list:
                    key_idx_list.append(key_idx)

        if self.verbose:
            print("key indexes: %s" %
                  " ".join([str(idx) for idx in key_idx_list]),
                  file=self.error_file,
                  flush=True)

        # Resolve the keep-first column names to indexes.  A keep-first
        # column may not also be a key column.
        self.keep_first_idx_list.clear()
        if len(self.keep_first_names) > 0:
            keep_first_name: str
            for keep_first_name in self.keep_first_names:
                if keep_first_name not in kr.column_name_map:
                    raise ValueError(
                        "Keep first column %s is not in the input file" %
                        (repr(keep_first_name)))
                keep_first_idx: int = kr.column_name_map[keep_first_name]
                if keep_first_idx in key_idx_list:
                    raise ValueError(
                        "Keep first column %s may not be a key column" %
                        (repr(keep_first_name)))
                self.keep_first_idx_list.append(keep_first_idx)
            if self.verbose:
                print("keep first indexes: %s" %
                      " ".join([str(idx) for idx in self.keep_first_idx_list]),
                      file=self.error_file,
                      flush=True)

        if self.deduplicate:
            # When compacting IDs, the ID column is treated as a keep-first
            # column so that it is left out of the extended key below.
            if self.compact_id and kr.id_column_idx >= 0 and kr.id_column_idx not in self.keep_first_idx_list:
                self.keep_first_idx_list.append(kr.id_column_idx)

            # Any columns that aren't in the keep_first list and aren't
            # already in key_idx_list will be appended to key_idx_list:
            idx: int
            for idx in range(kr.column_count):
                if idx not in self.keep_first_idx_list and idx not in key_idx_list:
                    key_idx_list.append(idx)

            if self.verbose:
                print("revised key indexes: %s" %
                      " ".join([str(idx) for idx in key_idx_list]),
                      file=self.error_file,
                      flush=True)

        # Report the final key indexes (this repeats the earlier report when
        # nothing was revised).
        if self.verbose:
            key_idx_list_str: typing.List[str] = []
            for key_idx in key_idx_list:
                key_idx_list_str.append(str(key_idx))
            print("key indexes: %s" % " ".join(key_idx_list_str),
                  file=self.error_file,
                  flush=True)

        # Translate the reader's mode into the writer's Mode enum by member
        # name.  The reader's own mode member is a different enum and would
        # not compare equal to any KgtkWriter.Mode member.
        output_mode: KgtkWriter.Mode = KgtkWriter.Mode[kr.mode.name]

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(
            output_column_names,
            self.output_file_path,
            mode=output_mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            use_mgzip=self.reader_options.use_mgzip,  # Hack!
            mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose)

        # Open the optional list output file.
        lew: typing.Optional[KgtkWriter] = None
        if self.list_output_file_path is not None:
            lew = KgtkWriter.open(
                output_column_names,
                self.list_output_file_path,
                mode=output_mode,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                use_mgzip=self.reader_options.use_mgzip,  # Hack!
                mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
                gzip_in_parallel=False,
                verbose=self.verbose,
                very_verbose=self.very_verbose)

        input_line_count: int = 0
        # Start with an empty row so the final flush call below has a row to
        # pass even when the input has no data lines.
        row: typing.List[str] = []
        input_key: str
        prev_input_key: typing.Optional[str] = None
        # Sort direction seen so far: None until two different keys have
        # been compared, then True for ascending or False for descending.
        going_up: typing.Optional[bool] = None
        if self.sorted_input:
            if self.verbose:
                print("Reading the input data from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            for row in kr:
                input_line_count += 1
                input_key = self.build_key(row, key_idx_list)
                if self.verify_sort:
                    if prev_input_key is None:
                        prev_input_key = input_key
                    else:
                        if going_up is None:
                            # The direction is fixed by the first pair of
                            # keys that differ.
                            if prev_input_key < input_key:
                                going_up = True
                                prev_input_key = input_key
                            elif prev_input_key > input_key:
                                going_up = False
                                prev_input_key = input_key
                            else:
                                pass  # No change in input key
                        elif going_up:
                            if prev_input_key < input_key:
                                prev_input_key = input_key
                            elif prev_input_key > input_key:
                                raise ValueError(
                                    "Line %d sort violation going up: prev='%s' curr='%s'"
                                    % (input_line_count,
                                       prev_input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR),
                                       input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR)))
                            else:
                                pass  # No change in input_key
                        else:
                            if prev_input_key > input_key:
                                prev_input_key = input_key
                            elif prev_input_key < input_key:
                                raise ValueError(
                                    "Line %d sort violation going down: prev='%s' curr='%s'"
                                    % (input_line_count,
                                       prev_input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR),
                                       input_key.replace(
                                           self.field_separator,
                                           KgtkFormat.LIST_SEPARATOR)))
                            else:
                                pass  # No change in input_key

                self.process_row(input_key, row, input_line_count, idb, ew,
                                 lew)

        else:
            if self.verbose:
                print("Sorting the input data from %s" % self.input_file_path,
                      file=self.error_file,
                      flush=True)
            # Map key values to lists of input and output data.
            input_map: typing.MutableMapping[
                str, typing.List[typing.List[str]]] = {}

            for row in kr:
                input_line_count += 1
                input_key = self.build_key(row, key_idx_list)
                if input_key in input_map:
                    # Append the row to an existing list for that key.
                    input_map[input_key].append(row)
                else:
                    # Create a new list of rows for this key.
                    input_map[input_key] = [row]

            if self.verbose:
                print("Processing the sorted input data",
                      file=self.error_file,
                      flush=True)

            # All rows have been read at this point, so input_line_count
            # passed below is the total number of lines read, not the line
            # number of the individual row.
            for input_key in sorted(input_map.keys()):
                for row in input_map[input_key]:
                    self.process_row(input_key, row, input_line_count, idb, ew,
                                     lew)

        # Flush the final row, if any.  We pass the last row read for
        # feedback, such as an ID uniqueness violation.
        self.process_row("", row, input_line_count, idb, ew, lew, flush=True)

        if self.verbose:
            print("Read %d records, excluded %d records, wrote %d records." %
                  (input_line_count, self.excluded_row_count,
                   self.output_line_count),
                  file=self.error_file,
                  flush=True)
            if lew is not None:
                print("Wrote %d list ouput records." %
                      (self.list_output_line_count),
                      file=self.error_file,
                      flush=True)

        ew.close()
        if lew is not None:
            lew.close()
Example #26
0
    def process_unsorted(self,
                         output_columns: typing.List[str],
                         kr: KgtkReader,
                         column_idxs: typing.List[int],
                         where_column_idx: int,
                         where_value_set: typing.Set[str]):
        """Count the unique values in the selected columns and write them.

        Every row of kr is read and the values found in the columns listed
        in column_idxs are counted in memory.  The counts are then written
        to self.output_file_path, ordered by value, in the layout selected
        by self.output_format.

        Args:
            output_columns: Column names for the output file.  In node
                format the unique values are appended to this list (the
                caller's list is modified).
            kr: The open input reader.
            column_idxs: Indexes of the columns whose values are counted.
            where_column_idx: Index of the column used to filter rows, or a
                negative number for no row filtering.
            where_value_set: A row is counted only when its value in the
                where column is in this set.

        Raises:
            ValueError: For a row too short to contain a selected column,
                an unknown value match type, a node format value that is
                also a node1 column name, node format with no selected
                columns, or an unknown output format.
        """

        if self.verbose:
            print("Counting unique values from the %s columns in %s" % (" ".join([repr(kr.column_names[column_idx]) for column_idx in column_idxs]),
                                                                        repr(str(self.input_file_path))), file=self.error_file, flush=True)
        input_line_count: int = 0
        skip_line_count: int = 0
        skip_value_count: int = 0
        empty_value_count: int = 0

        value_counts: typing.MutableMapping[str, int] = { }

        value_filter_re: typing.Optional[typing.Pattern] = None if len(self.value_filter) == 0 else re.compile(self.value_filter)

        # Select the regular expression matching method once, before the
        # loop.  An unknown match type is reported here instead of leaving
        # the match result unbound (or stale) inside the loop.
        value_matcher: typing.Optional[typing.Callable[[str], typing.Optional[typing.Match]]] = None
        if value_filter_re is not None:
            if self.value_match_type == "fullmatch":
                value_matcher = value_filter_re.fullmatch
            elif self.value_match_type == "match":
                value_matcher = value_filter_re.match
            elif self.value_match_type == "search":
                value_matcher = value_filter_re.search
            else:
                raise ValueError("Unknown value match type %s" % repr(self.value_match_type))

        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            if where_column_idx >= 0:
                if row[where_column_idx] not in where_value_set:
                    skip_line_count += 1
                    continue
            column_idx: int
            for column_idx in column_idxs:
                if len(row) <= column_idx:
                    raise ValueError("Line %d: Short row (len(row)=%d, column_idx=%d): %s" % (input_line_count, len(row), column_idx, repr(row)))
                value: str = row[column_idx]

                # The filter is applied to the raw value, before the empty
                # value substitution and the prefix.
                if value_matcher is not None and value_matcher(value) is None:
                    skip_value_count += 1
                    continue

                if len(value) == 0:
                    value = self.empty_value
                if len(value) > 0:
                    value = self.prefix + value
                    value_counts[value] = value_counts.get(value, 0) + 1
                else:
                    empty_value_count += 1

        if self.verbose:
            print("Read %d records, skipped %d, skipped %d values, found %d unique non-empty values, %d empty values." % (input_line_count,
                                                                                                                          skip_line_count,
                                                                                                                          skip_value_count,
                                                                                                                          len(value_counts),
                                                                                                                          empty_value_count),
                  file=self.error_file, flush=True)

        # Every output format lists the values in sorted order.
        sorted_values: typing.List[str] = sorted(value_counts)

        # In node format we can't open the output file until we are done
        # reading the input file, because we need the list of unique values to
        # build the column list.
        if self.output_format == self.NODE_FORMAT:
            for value in sorted_values:
                # TODO: provide a way to override this check.
                if value in KgtkFormat.NODE1_COLUMN_NAMES:
                    raise ValueError("Cannot write a KGTK node file with a column named '%s'." % value)
                output_columns.append(value)

        if self.verbose:
            print("Opening the output file: %s" % self.output_file_path, file=self.error_file, flush=True)

        ew: KgtkWriter = KgtkWriter.open(output_columns,
                                         self.output_file_path,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         use_mgzip=self.reader_options.use_mgzip if self.reader_options is not None else False, # Hack!
                                         mgzip_threads=self.reader_options.mgzip_threads if self.reader_options is not None else 3, # Hack!
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)

        try:
            if self.output_format == self.EDGE_FORMAT:
                for value in sorted_values:
                    ew.write([value, self.label_value, str(value_counts[value])])

            elif self.output_format == self.NODE_ONLY_FORMAT:
                for value in sorted_values:
                    ew.write([value])

            elif self.output_format == self.NODE_COUNTS_FORMAT:
                for value in sorted_values:
                    ew.write([value, str(value_counts[value])])

            elif self.output_format == self.NODE_FORMAT:
                # The single output row is labelled with the name of the
                # last selected column.  Index column_idxs directly rather
                # than relying on the loop variable above, which is unbound
                # when no input row reached the counting loop.
                if len(column_idxs) == 0:
                    raise ValueError("No columns were selected for counting.")
                row = [ kr.column_names[column_idxs[-1]] ]
                for value in sorted_values:
                    row.append(str(value_counts[value]))
                ew.write(row)

            else:
                raise ValueError("Unknown output format %s" % str(self.output_format))

            if self.verbose:
                print("There were %d unique values." % len(value_counts), file=self.error_file, flush=True)

        finally:
            # Close the output file even when writing failed.
            ew.close()
Example #27
0
    def process(self):
        """Concatenate the input files into a single output file.

        Each path in self.input_file_paths is opened with KgtkReader and its
        column names are passed to a KgtkMergeColumns instance.  The output
        file is opened with the merged column names, and every input row is
        copied to it through a per-file shuffle list built from that file's
        entry in kmc.new_column_name_lists.

        The input readers are closed when this method returns or raises,
        and so is the output writer once it has been opened.

        Raises:
            ValueError: If standard input ("-") is listed more than once,
                if edge files and node files are mixed, if
                self.output_column_names is supplied with a length that
                differs from the number of merged columns, or if a reader
                has no file path.
        """
        kmc: KgtkMergeColumns = KgtkMergeColumns()

        # Is the output file an edge file, a node file, or unknown?
        is_edge_file: bool = False
        is_node_file: bool = False

        krs: typing.List[KgtkReader] = []
        kr: KgtkReader
        idx: int

        if self.verbose:
            print("Starting kgtkcat pid=%d" % (os.getpid()),
                  file=self.error_file,
                  flush=True)

        if self.verbose:
            print("Opening the %d input files." % len(self.input_file_paths),
                  file=self.error_file,
                  flush=True)

        try:
            saw_stdin: bool = False
            input_file_path: Path
            for idx, input_file_path in enumerate(self.input_file_paths):
                if str(input_file_path) == "-":
                    if saw_stdin:
                        raise ValueError("Duplicate standard input file %d" %
                                         (idx + 1))
                    # Remember that standard input has been used, so that a
                    # second "-" is rejected above.
                    saw_stdin = True
                    if self.verbose:
                        print("Opening file %d: standard input" % (idx + 1),
                              file=self.error_file,
                              flush=True)
                else:
                    if self.verbose:
                        print("Opening file %d: %s" %
                              (idx + 1, str(input_file_path)),
                              file=self.error_file,
                              flush=True)

                kr = KgtkReader.open(
                    input_file_path,
                    who="input " + str(idx + 1),
                    options=self.reader_options,
                    value_options=self.value_options,
                    error_file=self.error_file,
                    verbose=self.verbose,
                    very_verbose=self.very_verbose,
                )
                krs.append(kr)

                # Unless directed otherwise, do not merge edge files with node
                # files.  If options.mode == KgtkReaderMode.NONE, then neither
                # kr.is_edge_file nor kr.is_node_file will be set and the
                # consistency check will be skipped.
                if kr.is_edge_file:
                    if is_node_file:
                        raise ValueError(
                            "Cannot merge an edge file to a node file: %s" %
                            input_file_path)
                    if not is_edge_file and self.verbose:
                        print("The output file will be an edge file.",
                              file=self.error_file,
                              flush=True)
                    is_edge_file = True
                elif kr.is_node_file:
                    if is_edge_file:
                        raise ValueError(
                            "Cannot merge a node file to an edge file: %s" %
                            input_file_path)
                    if not is_node_file and self.verbose:
                        print("The output file will be an node file.",
                              file=self.error_file,
                              flush=True)
                    is_node_file = True

                if self.verbose or self.very_verbose:
                    print("Mapping the %d column names in %s." %
                          (len(kr.column_names), input_file_path),
                          file=self.error_file,
                          flush=True)
                if self.very_verbose:
                    print(" ".join(kr.column_names),
                          file=self.error_file,
                          flush=True)
                new_column_names: typing.List[str] = kmc.merge(kr.column_names)
                if self.very_verbose:
                    print(" ".join(new_column_names),
                          file=self.error_file,
                          flush=True)

            if self.verbose or self.very_verbose:
                print("There are %d merged columns." % len(kmc.column_names),
                      file=self.error_file,
                      flush=True)
            if self.very_verbose:
                print(" ".join(kmc.column_names),
                      file=self.error_file,
                      flush=True)

            if self.output_column_names is not None:
                if self.verbose:
                    print("There are %d new output column names." %
                          len(self.output_column_names),
                          file=self.error_file,
                          flush=True)
                if len(self.output_column_names) != len(kmc.column_names):
                    raise ValueError(
                        "There are %d merged columns, but %d output column names."
                        % (len(kmc.column_names),
                           len(self.output_column_names)))

            output_mode: KgtkWriter.Mode = KgtkWriter.Mode.NONE
            if is_edge_file:
                output_mode = KgtkWriter.Mode.EDGE
                if self.verbose:
                    print("Opening the output edge file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)
            elif is_node_file:
                output_mode = KgtkWriter.Mode.NODE
                if self.verbose:
                    print("Opening the output node file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)
            else:
                if self.verbose:
                    print("Opening the output file: %s" %
                          str(self.output_path),
                          file=self.error_file,
                          flush=True)

            ew: KgtkWriter = KgtkWriter.open(
                kmc.column_names,
                self.output_path,
                require_all_columns=False,
                prohibit_extra_columns=True,
                fill_missing_columns=True,
                use_mgzip=self.reader_options.use_mgzip,  # Hack!
                mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
                gzip_in_parallel=False,
                mode=output_mode,
                output_format=self.output_format,
                output_column_names=self.output_column_names,
                old_column_names=self.old_column_names,
                new_column_names=self.new_column_names,
                verbose=self.verbose,
                very_verbose=self.very_verbose)

            try:
                output_data_lines: int = 0
                for idx, kr in enumerate(krs):
                    if kr.file_path is None:
                        # This shouldn't happen because we constrained all
                        # input_file_path elements to be not None.  However,
                        # checking here keeps mypy happy.
                        #
                        # TODO: throw a better exception.
                        raise ValueError("Missing file path.")
                    input_file_path = kr.file_path
                    if self.verbose:
                        print("Copying data from file %d: %s" %
                              (idx + 1, input_file_path),
                              file=self.error_file,
                              flush=True)

                    # Maps this file's columns onto the merged output columns.
                    shuffle_list: typing.List[int] = ew.build_shuffle_list(
                        kmc.new_column_name_lists[idx])

                    input_data_lines: int = 0
                    row: typing.List[str]
                    for row in kr:
                        input_data_lines += 1
                        output_data_lines += 1
                        ew.write(row, shuffle_list=shuffle_list)

                    # Flush the output file so far:
                    ew.flush()

                    if self.verbose:
                        print("Read %d data lines from file %d: %s" %
                              (input_data_lines, idx + 1, input_file_path),
                              file=self.error_file,
                              flush=True)

                if self.verbose:
                    print("Wrote %d lines total from %d files" %
                          (output_data_lines, len(krs)),
                          file=self.error_file,
                          flush=True)
            finally:
                # Close the output file, even when copying failed.
                ew.close()
        finally:
            # Close every input file that was opened, whether the merge
            # succeeded or an exception is propagating.
            for kr2 in krs:
                kr2.close()
Example #28
0
    def process_presorted(self,
                          output_columns: typing.List[str],
                          kr: KgtkReader,
                          column_idx: int,
                          where_column_idx: int,
                          where_value_set: typing.Set[str]):

        if self.verbose:
            print("Counting unique values from the %s column in presorted %s" % (kr.column_names[column_idx], self.input_file_path), file=self.error_file, flush=True)
        input_line_count: int = 0
        skip_line_count: int = 0
        skip_value_count: int = 0
        empty_value_count: int = 0
        unique_value_count: int = 0

        previous_value: typing.Optional[str] = None
        value_count: int

        if self.verbose:
            print("Opening the output file: %s" % self.output_file_path, file=self.error_file, flush=True)

        ew: KgtkWriter = KgtkWriter.open(output_columns,
                                         self.output_file_path,
                                         require_all_columns=False,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=True,
                                         use_mgzip=self.reader_options.use_mgzip if self.reader_options is not None else False, # Hack!
                                         mgzip_threads=self.reader_options.mgzip_threads if self.reader_options is not None else 3, # Hack!
                                         gzip_in_parallel=False,
                                         verbose=self.verbose,
                                         very_verbose=self.very_verbose)        

        value_filter_re: typing.Optional[typing.Pattern] = None if len(self.value_filter) == 0 else re.compile(self.value_filter)

        row: typing.List[str]
        for row in kr:
            input_line_count += 1
            if where_column_idx >= 0:
                if row[where_column_idx] not in where_value_set:
                    skip_line_count += 1
                    continue
            if len(row) <= column_idx:
                raise ValueError("Line %d: Short row (len(row)=%d, column_idx=%d): %s" % (input_line_count, len(row), column_idx, repr(row)))
            value: str = row[column_idx]

            if value_filter_re is not None:
                match: typing.Optional[typing.Match]
                if self.value_match_type == "fullmatch":
                    match = value_filter_re.fullmatch(value)
                elif self.value_match_type == "match":
                    match = value_filter_re.match(value)
                elif self.value_match_type == "search":
                    match = value_filter_re.search(value)
                if match is None:
                    skip_value_count += 1
                    continue

            if len(value) == 0:
                value = self.empty_value
            if len(value) > 0:
                value = self.prefix + value

            if previous_value is None:
                previous_value = value
                value_count = 1
                unique_value_count += 1

            else:
                if value < previous_value:
                    raise ValueError("Line %d: input is not presorted: value %s < previous value %s" % (input_line_count, repr(previous_value), repr(value)))

                if value == previous_value:
                    value_count += 1

                else:
                    if len(previous_value) == 0:
                        empty_value_count = value_count

                    elif self.output_format == self.EDGE_FORMAT:
                        ew.write([previous_value, self.label_value, str(value_count)])

                    elif self.output_format == self.NODE_ONLY_FORMAT:
                        ew.write([previous_value])

                    elif self.output_format == self.NODE_COUNTS_FORMAT:
                        ew.write([previous_value, str(value_count)])

                    else:
                        raise ValueError("Unknown output format %s" % str(self.output_format))

                    previous_value = value
                    value_count = 1
                    unique_value_count += 1

        if previous_value is not None:
            if len(previous_value) == 0:
                empty_value_count = value_count

            elif self.output_format == self.EDGE_FORMAT:
                ew.write([previous_value, self.label_value, str(value_count)])

            elif self.output_format == self.NODE_ONLY_FORMAT:
                ew.write([previous_value])

            elif self.output_format == self.NODE_COUNTS_FORMAT:
                ew.write([previous_value, str(value_count)])

            else:
                raise ValueError("Unknown output format %s" % str(self.output_format))
            

        if self.verbose:
            print("Read %d records, skipped %d, skipped %d values, found %d unique non-empty values, %d empty values." % (input_line_count,
                                                                                                                          skip_line_count,
                                                                                                                          skip_value_count,
                                                                                                                          unique_value_count,
                                                                                                                          empty_value_count),
                  file=self.error_file, flush=True)

        ew.close()
Example #29
0
    def process(self):
        """Filter the input file against the filter file and write the results.

        Opens the input and filter readers, resolves the key columns for each
        of them, opens the output and/or reject writers when their paths are
        configured, and then hands the work to one of the three
        ``process_cacheing_*`` strategies, selected by ``self.cache_input``
        and ``self.preserve_order``.

        If the input and filter files yield a different number of key
        columns, a message is printed to ``self.error_file`` and the method
        returns before any writer is opened.

        Every writer that was opened is closed before this method returns,
        including when the selected strategy (or opening the second writer)
        raises an exception.
        """
        UPDATE_VERSION: str = "2020-08-24T21:47:20.256050+00:00#mr0wtMHlN/QaplDsGc/ylG3Hw5stsjziykzuGlSHBSion4xoW/Bec0sn55IQ7wFWBUClRS7g1tbAuaqEduhUVA=="
        if self.show_version or self.verbose:
            print("KgtkIfEfexists version: %s" % UPDATE_VERSION, file=self.error_file, flush=True)

        # Open the input files once.
        if self.verbose:
            if self.input_file_path is not None:
                print("Opening the input file: %s" % self.input_file_path, file=self.error_file, flush=True)
            else:
                print("Reading the input data from stdin", file=self.error_file, flush=True)

        input_kr: KgtkReader =  KgtkReader.open(self.input_file_path,
                                                error_file=self.error_file,
                                                who="input",
                                                options=self.input_reader_options,
                                                value_options = self.value_options,
                                                verbose=self.verbose,
                                                very_verbose=self.very_verbose,
        )

        if self.verbose:
            print("Opening the filter input file: %s" % self.filter_file_path, file=self.error_file, flush=True)
        filter_kr: KgtkReader = KgtkReader.open(self.filter_file_path,
                                                who="filter",
                                                error_file=self.error_file,
                                                options=self.filter_reader_options,
                                                value_options=self.value_options,
                                                verbose=self.verbose,
                                                very_verbose=self.very_verbose,
        )

        input_key_columns: typing.List[int] = self.get_key_columns(self.input_keys, input_kr, filter_kr, "input")
        filter_key_columns: typing.List[int] = self.get_key_columns(self.filter_keys, filter_kr, input_kr, "filter")

        # The keys are compared position by position, so both sides must
        # supply the same number of key columns.
        # NOTE(review): the two readers are not explicitly closed on this
        # early return (nor anywhere else in this method) -- confirm whether
        # that is intentional.
        if len(input_key_columns) != len(filter_key_columns):
            print("There are %d input key columns but %d filter key columns.  Exiting." % (len(input_key_columns), len(filter_key_columns)),
                  file=self.error_file, flush=True)
            return

        ew: typing.Optional[KgtkWriter] = None
        rew: typing.Optional[KgtkWriter] = None
        try:
            # Both writers are optional; each one copies the input file's
            # column names and mode.
            if self.output_file_path is not None:
                if self.verbose:
                    print("Opening the output file: %s" % self.output_file_path, file=self.error_file, flush=True)
                ew = KgtkWriter.open(input_kr.column_names,
                                     self.output_file_path,
                                     mode=input_kr.mode,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

            if self.reject_file_path is not None:
                if self.verbose:
                    print("Opening the reject file: %s" % self.reject_file_path, file=self.error_file, flush=True)
                rew = KgtkWriter.open(input_kr.column_names,
                                      self.reject_file_path,
                                      mode=input_kr.mode,
                                      require_all_columns=False,
                                      prohibit_extra_columns=True,
                                      fill_missing_columns=True,
                                      gzip_in_parallel=False,
                                      verbose=self.verbose,
                                      very_verbose=self.very_verbose)

            # Select the processing strategy.  All three take the same
            # keyword arguments, so choose the bound method once and call it
            # in a single place.
            if not self.cache_input:
                strategy = self.process_cacheing_filter
            elif self.preserve_order:
                strategy = self.process_cacheing_input_preserving_order
            else:
                strategy = self.process_cacheing_input

            strategy(input_kr=input_kr,
                     filter_kr=filter_kr,
                     input_key_columns=input_key_columns,
                     filter_key_columns=filter_key_columns,
                     ew=ew,
                     rew=rew)
        finally:
            # Close whatever was opened, even on failure.  The nested
            # try/finally ensures the reject writer is still closed if
            # closing the output writer raises.
            try:
                if ew is not None:
                    ew.close()
            finally:
                if rew is not None:
                    rew.close()
Example #30
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        columns: typing.Optional[typing.List[str]] = None,
        labels: typing.Optional[typing.List[str]] = None,
        id_column_name: typing.Optional[str] = None,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Convert the rows of a KGTK node file into (node1, label, node2) edges.

    For every input row, each non-ID column holding a non-empty value
    produces one output edge per list item in that value: node1 is the row's
    ID column value, label is the column name (or its replacement from
    ``labels``), and node2 is the item.

    Args:
        input_file: The input file specification.
        output_file: The output file specification.
        columns: When given, only these columns are converted to edges.
        labels: When given (and non-empty), the edge labels to use in place
            of the column names; paired positionally with ``columns``.
        id_column_name: The name of the ID column, passed to the reader's
            ``get_id_column_index``.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Not read by this function; messages go to stderr
            unless ``errors_to_stdout`` is set.
        show_options: Print the effective options before processing.
        verbose: Print progress messages.
        very_verbose: Passed through to the reader and writer.
        **kwargs: Used to build the reader and value option structures.

    Returns:
        0 on success; 1 when an exception raised while reading or writing
        was passed to ``kgtk_exception_auto_handler``.

    Raises:
        KGTKException: If ``labels`` is supplied without ``columns``, or the
            two lists differ in length.  These checks run before the
            handled section, so the exception propagates to the caller.
    """
    # import modules locally
    import os

    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)

        if columns is not None:
            print("--columns=%s" % " ".join(columns), file=error_file)
        if labels is not None:
            print("--labels=%s" % " ".join(labels), file=error_file)
        if id_column_name is not None:
            print("--id-column=%s" % id_column_name, file=error_file)

        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    if verbose:
        print("Starting normalize_nodes pid=%d" % (os.getpid()),
              file=error_file,
              flush=True)

    # Map each selected column name to the label that replaces it.
    label_map: typing.MutableMapping[str, str] = {}
    if labels is not None and len(labels) > 0:
        if columns is None:
            raise KGTKException(
                "--columns must be supplied when --labels is used.")
        if len(columns) != len(labels):
            raise KGTKException("%d columns were supplied, but %d labels." %
                                (len(columns), len(labels)))
        label_map = dict(zip(columns, labels))

    # Build the column filter once so the per-cell membership test below
    # does not rescan the list for every value of every row.
    selected_columns: typing.Optional[typing.Set[str]] = \
        None if columns is None else set(columns)

    try:
        if verbose:
            print("Opening the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)
        # NOTE(review): the reader is never explicitly closed in this
        # function -- confirm whether that is intentional.
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        id_column_idx: int = kr.get_id_column_index(id_column_name)
        if id_column_idx < 0:
            raise KGTKException("Unknown ID column %s" % repr(id_column_name))

        output_column_names: typing.List[str] = [
            KgtkFormat.NODE1, KgtkFormat.LABEL, KgtkFormat.NODE2
        ]

        if verbose:
            print("Opening the output file: %s" % str(output_kgtk_file),
                  file=error_file,
                  flush=True)
        kw: KgtkWriter = KgtkWriter.open(output_column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        # From here on the writer is open: make sure it is closed even when
        # a row fails to process.
        try:
            input_line_count: int = 0
            output_line_count: int = 0
            row: typing.List[str]
            for row in kr:
                input_line_count += 1

                node1_value: str = row[id_column_idx]

                column_idx: int
                column_name: str
                for column_idx, column_name in enumerate(kr.column_names):
                    if column_idx == id_column_idx:
                        continue
                    if selected_columns is not None and column_name not in selected_columns:
                        continue

                    label_value: str = label_map.get(column_name, column_name)

                    new_value: str = row[column_idx]
                    if len(new_value) == 0:
                        continue  # ignore empty values.

                    # The column value might contain a KGTK list.  Since node2 isn't supposed
                    # to contain lists, we'll split it.
                    node2_value: str
                    for node2_value in KgtkValue.split_list(new_value):
                        if len(node2_value) == 0:
                            continue  # node2 shouldn't contain empty values

                        kw.write([node1_value, label_value, node2_value])
                        output_line_count += 1

            if verbose:
                print("Read %d node rows, wrote %d edge rows." %
                      (input_line_count, output_line_count),
                      file=error_file,
                      flush=True)
        finally:
            kw.close()

        return 0

    except Exception as e:
        kgtk_exception_auto_handler(e)
        return 1