Exemple #1
0
def load_property_labels_file(input_files: typing.List[str]):
    """Build a node-id -> label mapping from tab-separated label files.

    Every file starts with a header line.  The label column is the one named
    "label"; the id column is the one named "predicate" when the header has
    it, otherwise the first column.  A label containing "@en" has every
    single quote removed and everything from the first "@" onwards dropped,
    and it always replaces an earlier entry for the same id.  Any other label
    is stored only when the id has no entry yet.

    Args:
        input_files: paths of the label files, read in the given order.

    Returns:
        A dict mapping each node id to the label selected for it.

    Raises:
        KGTKException: if a header has fewer than two columns or has no
            "label" column.
    """
    labels_dict: typing.MutableMapping[str, str] = {}
    for each_file in input_files:
        # Column positions are resolved per file: each file carries its own
        # header line, which must not be stored as a data row, and its
        # columns may be ordered differently from the previous file's.
        column_references: typing.Optional[typing.Dict[str, int]] = None
        with open(each_file, "r") as f:
            # Iterate the handle directly instead of loading every line
            # into memory with readlines().
            for each_line in f:
                fields: typing.List[str] = each_line.rstrip("\n").split("\t")
                if column_references is None:
                    if len(fields) < 2:
                        raise KGTKException(
                            "No enough columns found on given input file. Only {} columns given but at least 2 needed.".format(
                                len(fields)))
                    if "label" not in fields:
                        raise KGTKException("Can't determine which column is label column for label file!")
                    column_references = {
                        "predicate": fields.index("predicate") if "predicate" in fields else 0,
                        "label": fields.index("label"),
                    }
                    continue

                if fields == [""]:
                    # Blank line (e.g. a trailing newline): nothing to load.
                    continue

                node_id: str = fields[column_references["predicate"]]
                node_label: str = fields[column_references["label"]]
                if "@en" in node_label:
                    # English labels win over whatever was stored before.
                    labels_dict[node_id] = node_label.replace("'", "").split("@")[0]
                elif node_id not in labels_dict:
                    labels_dict[node_id] = node_label
    return labels_dict
Exemple #2
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        key_column_names: typing.List[str],
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run KgtkExpand over a single input file.

    The input and output paths are resolved through KGTKArgumentParser, the
    reader and value options are built from 'kwargs', and the work is
    delegated to KgtkExpand.process().  Messages go to stdout when
    'errors_to_stdout' is set and to stderr otherwise; 'errors_to_stderr' is
    accepted but not consulted here.

    Returns 0 on success.  A SystemExit or any other Exception raised by the
    processing step is re-raised as a KGTKException.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkexpand import KgtkExpand
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Messages default to stderr unless stdout was requested.
    if errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    # Both option structures are derived from the same keyword arguments.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective settings for debugging and documentation.
        print(f"--input-file={source_path!s}", file=error_file)
        print(f"--output-file={target_path!s}", file=error_file)
        print(f"--columns={' '.join(key_column_names)}", file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        expander: KgtkExpand = KgtkExpand(
            input_file_path=source_path,
            key_column_names=key_column_names,
            output_file_path=target_path,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        expander.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
Exemple #3
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        filter_column_names: typing.List[str],
        all_are: bool = False,
        only_count: bool = False,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run KgtkIfEmpty (with notempty=False) over a single input file.

    The input and output paths are resolved through KGTKArgumentParser, the
    reader and value options are built from 'kwargs', and the work is
    delegated to KgtkIfEmpty.process().  Messages go to stdout when
    'errors_to_stdout' is set and to stderr otherwise; 'errors_to_stderr' is
    accepted but not consulted here.

    Returns 0 on success.  A SystemExit or any other Exception raised by the
    processing step is re-raised as a KGTKException.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Messages default to stderr unless stdout was requested.
    if errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    # Both option structures are derived from the same keyword arguments.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective settings for debugging and documentation.
        print(f"--input-file={source_path!s}", file=error_file)
        print(f"--output-file={target_path!s}", file=error_file)
        print(f"--columns={' '.join(filter_column_names)}", file=error_file)
        print(f"--count={only_count!s}", file=error_file)
        print(f"--all={all_are!s}", file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        checker: KgtkIfEmpty = KgtkIfEmpty(
            input_file_path=source_path,
            filter_column_names=filter_column_names,
            output_file_path=target_path,
            all_are=all_are,
            notempty=False,
            only_count=only_count,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        checker.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
Exemple #4
0
def parse_query_command(command):
    """Turn a query 'command' into a normalized options dictionary.

    'command' is either a single string, which is tokenized with shlex, or an
    iterable of arguments, each of which is converted with str().  The
    particles 'kgtk' and 'query' are tolerated so that top-level KGTK query
    commands can be mirrored exactly.  None or an empty command yields an
    empty dictionary.

    Raises KGTKException for any other unrecognized argument, and for an
    output specification other than '-'.
    """
    if isinstance(command, str):
        tokens = shlex.split(command)
    elif command is None:
        tokens = None
    else:
        tokens = [str(arg) for arg in command]
    if not tokens:
        return {}

    namespace, leftovers = COMMAND_ARGUMENT_PARSER.parse_known_args(tokens)
    for leftover in leftovers:
        if leftover != 'kgtk' and leftover != 'query':
            raise KGTKException(f'illegal query API option: {leftover}')

    # work on the plain dictionary view of the parsed namespace:
    settings = vars(namespace)
    destination = settings.get('output')
    if destination is not None and destination != '-':
        raise KGTKException('output specification not supported in query API')

    # remember the log level as given, before preprocessing sees it:
    requested_loglevel = settings.get('loglevel')
    options = cliquery.preprocess_query_options(**settings)
    if requested_loglevel is not None:
        # an explicitly requested log level always wins:
        options['loglevel'] = requested_loglevel
    elif options.get('loglevel') == 0:
        # drop the preprocessing default so the API default applies:
        options['loglevel'] = None

    # keep only options that have a value so callers can 'get' with defaults:
    normalized = {}
    for name, value in options.items():
        if value is not None:
            normalized[name] = value
    return normalized
Exemple #5
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        output_format: str,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run KgtkCat over a single input file, writing it in 'output_format'.

    The input and output paths are resolved through KGTKArgumentParser, the
    reader and value options are built from 'kwargs', and the work is
    delegated to KgtkCat.process() with a one-element input list.  Messages
    go to stdout when 'errors_to_stdout' is set and to stderr otherwise;
    'errors_to_stderr' is accepted but not consulted here.

    Returns 0 on success.  A SystemExit or any other Exception raised by the
    processing step is re-raised as a KGTKException.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Messages default to stderr unless stdout was requested.
    if errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    # TODO: check that at most one input file is stdin?

    # Both option structures are derived from the same keyword arguments.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective settings for debugging and documentation.
        print(f"--input-file={source_path!s}", file=error_file, flush=True)
        print(f"--output-file={target_path!s}", file=error_file, flush=True)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        concatenator: KgtkCat = KgtkCat(
            input_file_paths=[source_path],
            output_path=target_path,
            output_format=output_format,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        concatenator.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
Exemple #6
0
def run(
        input_kgtk_file: typing.Optional[Path],
        output_kgtk_file: typing.Optional[Path],
        key_column_names: typing.List[str],
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run KgtkExpand on already resolved input and output paths.

    The paths are handed to KgtkExpand unchanged; when the options are
    shown, a path of None is displayed as "-".  The reader and value options
    are built from 'kwargs'.  Messages go to stdout when 'errors_to_stdout'
    is set and to stderr otherwise; 'errors_to_stderr' is accepted but not
    consulted here.

    Returns 0 on success.  A SystemExit or any other Exception raised by the
    processing step is re-raised as a KGTKException.
    """
    # import modules locally
    from kgtk.exceptions import KGTKException

    # Messages default to stderr unless stdout was requested.
    if errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    # Both option structures are derived from the same keyword arguments.
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if show_options:
        # Echo the effective settings for debugging and documentation.
        shown_input = "-" if input_kgtk_file is None else str(input_kgtk_file)
        print("input: %s" % shown_input, file=error_file)
        print(f"--columns={' '.join(key_column_names)}", file=error_file)
        shown_output = "-" if output_kgtk_file is None else str(output_kgtk_file)
        print("--output-file=%s" % shown_output, file=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        expander: KgtkExpand = KgtkExpand(
            input_file_path=input_kgtk_file,
            key_column_names=key_column_names,
            output_file_path=output_kgtk_file,
            reader_options=reader_options,
            value_options=value_options,
            error_file=error_file,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        expander.process()
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
Exemple #7
0
def run(
        kgtk_browser_host: str = '0.0.0.0',
        kgtk_browser_port: str = '5000',
        kgtk_browser_config: str = 'kgtk_browser_config.py',
        kgtk_browser_app: str = 'kgtk_browser_app.py',

        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,

        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Start the KGTK browser Flask app and open it in the default web browser.

    FLASK_APP and KGTK_BROWSER_CONFIG are exported into the process
    environment, a timer opens "http://<host>:<port>/browser" after 2.5
    seconds, and "flask run" is started through the shell with the selected
    host and port.  The call blocks until that command finishes.

    Returns 0 once the flask command has returned; its exit status is not
    inspected.  A SystemExit or any other Exception raised along the way is
    re-raised as a KGTKException.
    """
    # import modules locally
    from pathlib import Path
    import simplejson as json
    import webbrowser
    import threading
    import os, sys
    import typing

    from kgtk.exceptions import KGTKException

    # Messages default to stderr unless stdout was requested.
    if errors_to_stdout:
        error_file: typing.TextIO = sys.stdout
    else:
        error_file = sys.stderr

    try:
        # Tell flask which app to serve and the app which config to load.
        os.environ["FLASK_APP"] = kgtk_browser_app
        os.environ["KGTK_BROWSER_CONFIG"] = kgtk_browser_config

        # Schedule the browser launch so it fires while the server is
        # starting; os.system below blocks this thread.
        url = f"http://{kgtk_browser_host}:{kgtk_browser_port}/browser"
        opener = threading.Timer(2.5, webbrowser.open, args=(url,))
        opener.start()

        # Serve on the selected host and port.
        flask_command = f"flask run --host {kgtk_browser_host} --port {kgtk_browser_port}"
        os.system(flask_command)
    except SystemExit:
        raise KGTKException("Exit requested")
    except Exception as err:
        raise KGTKException(str(err))

    return 0
Exemple #8
0
def run(input=None, output=None, columns='1', reverse=False, space=False, speed=False,
        extra='', tsv=False, csv=False, _dt=None):
    """Sort 'input' into 'output' as described by the command-line arguments.

    The column separator is a comma when CSV is selected ('csv' or
    _dt == 'csv') and 'tsv' is not set; otherwise it is a tab.  The sort
    options start from 'extra' and gain ' -r' for 'reverse' plus either the
    space or the speed configuration ('space' takes precedence).

    Returns the exit code of the executed command pipeline.  When the
    pipeline ends with SIGPIPE, stdout is reopened and None is returned.
    Any other exception is re-raised as a KGTKException.
    """
    try:
        if not tsv and (csv or _dt == 'csv'):
            colsep = ','
        else:
            colsep = '\t'

        options = extra
        if reverse:
            options += ' -r'
        if space:
            options += ' ' + space_config
        elif speed:
            options += ' ' + speed_config

        sort_pipeline = build_command(input=input, output=output, columns=columns,
                                      colsep=colsep, options=options)
        outcome = zcat.run_sh_commands(sort_pipeline)
        return outcome.exit_code
    except sh.SignalException_SIGPIPE:
        # hack to work around Python3 issue when stdout is gone when we try to report an exception;
        # without this we get an ugly 'Exception ignored...' msg when we quit with head or a pager:
        sys.stdout = os.fdopen(1)
    except Exception as err:
        raise KGTKException(f'INTERNAL ERROR: {type(err).__module__}.{err!s}\n')
def run(labels: str, aliases: str, descriptions: str, property_file: str,
        n: int, truthy: bool, warning: bool, use_id: bool, log_path: str,
        prop_declaration: bool, prefix_path: str, input_file: KGTKFiles,
        output_file: str, error_action: str):
    """Configure a TripleGenerator from the arguments and run it.

    Every argument is forwarded to the TripleGenerator constructor under the
    keyword it expects (for example 'labels' as 'label_set' and
    'output_file' as 'dest_fp').  Any Exception raised by
    TripleGenerator.process() is re-raised wrapped in a KGTKException.
    """
    # import modules locally
    from kgtk.generator import TripleGenerator
    from kgtk.exceptions import KGTKException

    generator_settings = dict(
        prop_file=property_file,
        label_set=labels,
        alias_set=aliases,
        description_set=descriptions,
        n=n,
        warning=warning,
        truthy=truthy,
        use_id=use_id,
        dest_fp=output_file,
        log_path=log_path,
        prop_declaration=prop_declaration,
        prefix_path=prefix_path,
        input_file=input_file,
        error_action=error_action,
    )
    # Construction happens outside the try: only processing errors are wrapped.
    generator = TripleGenerator(**generator_settings)

    try:
        generator.process()
    except Exception as err:
        raise KGTKException(err)
Exemple #10
0
def build_sort_key_spec(header, columns, colsep='\t'):
    """Given a KGTK file `header' line and a user-provided `columns' spec, generate a sequence of
    Unix sort key definitions representative of those columns.  For example, columns=subject,object
    will translate into '-k 1,1 -k 3,3'.  Columns can be specified by the names used in the file
    header line, as 1-based positions, or through the pre-defined positions of reserved names such
    as `subject', etc.  Columns found in the header will override any predefined positions.

    Empty entries in `columns' are ignored.  Raises KGTKException for a column that is neither a
    header name, nor an integer, nor a reserved name.  The returned string ends in ' \\t'.
    """
    import re
    columns = [
        c.strip() for c in re.split(column_spec_split_regex, columns.strip())
    ]
    header = [c.strip() for c in header.split(colsep)]
    keys = []
    for col in columns:
        if col == '':
            continue
        if col in header:
            # names found in the header take precedence over everything else:
            index = header.index(col) + 1
        else:
            try:
                index = int(col)
            except ValueError:
                # not a position; only a non-numeric spec is expected here, so do not
                # swallow unrelated exceptions such as KeyboardInterrupt:
                index = reserved_name_columns.get(col)
        if index is None:
            raise KGTKException('Unknown column: ' + col)
        keys.append('-k %d,%d' % (index, index))
    # special whitespace at the end is used by `wait_for_key_spec' below:
    return ' '.join(keys) + ' \t'
Exemple #11
0
def run(input_file: KGTKFiles,
        output_file: KGTKFiles,
        no_header: bool = False,
        properties: str = '',
        undirected: bool = False,
        strong: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
        ) -> int:
    """Run ConnectedComponents over a single input file.

    The input and output paths are resolved through KGTKArgumentParser and
    passed, together with the remaining named arguments, to the
    ConnectedComponents constructor.  'kwargs' is accepted but not used here.

    Returns 0 on success.  Any Exception raised by
    ConnectedComponents.process() is re-raised as a KGTKException.
    """
    from kgtk.gt.connected_components import ConnectedComponents
    from kgtk.exceptions import KGTKException

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # Construction happens outside the try: only processing errors are wrapped.
    components: ConnectedComponents = ConnectedComponents(
        input_file_path=source_path,
        output_file_path=target_path,
        no_header=no_header,
        properties=properties,
        undirected=undirected,
        strong=strong,
    )

    try:
        components.process()
    except Exception as err:
        raise KGTKException(str(err))
    return 0
Exemple #12
0
 def execute_to_file(self, file=sys.stdout, noheader=False, **params):
     """Execute this query with the given 'params' and write the result to the file or
     file-like object 'file' in KGTK format.  Output a header unless 'noheader' is true.

     A string 'file' is treated as a path: it is opened for writing here and closed again
     before returning.  Any other object is written to as is and left open.  Raises
     KGTKException if the output object has no 'write' attribute.
     """
     self.refresh()
     if hasattr(self.exec_wrapper, 'cache_clear'):
         # if this is a re-call of a caching query, ensure it is cleared,
         # since the result iterator was used up in the previous call:
         self.exec_wrapper.cache_clear()
     parameters = self._subst_params(self.parameters, params)
     # open outside of the 'try' so that a failing 'open' surfaces its own error
     # instead of being masked by the cleanup touching an unbound 'out':
     opened_here = isinstance(file, str)
     out = open(file, 'w') if opened_here else file
     try:
         if not hasattr(out, 'write'):
             raise KGTKException('expected file or file-like object')
         result = self.exec_wrapper(self, parameters, iter)
         csvwriter = csv.writer(out,
                                dialect=None,
                                delimiter='\t',
                                quoting=csv.QUOTE_NONE,
                                quotechar=None,
                                lineterminator='\n',
                                escapechar=None)
         if not noheader:
             csvwriter.writerow(self.get_result_header())
         csvwriter.writerows(result)
     finally:
         # only close what was opened here; caller-supplied objects stay open:
         if opened_here:
             out.close()
Exemple #13
0
 def _exec(self, parameters, fmt):
     """Run the SQL of this query with 'parameters' and return the rows in format 'fmt'.

     This is the internal execution wrapper, kept separate so it can easily be memoized.
     'fmt' may be a type or the name of one: 'iter' returns the raw result object, 'tuple'
     (also used when 'fmt' is None) and 'list' materialize the rows, and 'df', 'dataframe'
     or 'DataFrame' build a pandas DataFrame using the result header as column names.
     Any other format raises KGTKException.  As a side effect, the result header of the
     underlying query is filled in from the result description if it is not set yet.
     """
     # TO DO: abstract some of this better in KgtkQuery API
     query = self.kgtk_query
     rows = query.store.execute(self.sql, parameters)
     if query.result_header is None:
         header = []
         for column in rows.description:
             header.append(query.unalias_column_name(column[0]))
         query.result_header = header

     if fmt is None:
         # materialize as a tuple so the rows can be reused if we memoize:
         return tuple(rows)

     # allow types and their names:
     fmt_name = getattr(fmt, '__name__', None) or str(fmt)
     if fmt_name == 'iter':
         return rows
     if fmt_name == 'tuple':
         return tuple(rows)
     if fmt_name == 'list':
         return list(rows)
     if fmt_name in ('df', 'dataframe', 'DataFrame'):
         if not _have_pandas:
             _import_pandas()
         return pd.DataFrame(rows, columns=query.result_header)
     # TO DO: consider supporting namedtuple and/or sqlite3.Row as row_factory types
     #        (for sqlite3.Row we have the issue that aliases become keys())
     raise KGTKException('unsupported query result format: %s' % fmt_name)
Exemple #14
0
 def calculate_distance(a, b):
     """Return the Euclidean distance between the equally long vectors 'a' and 'b'.

     Raises KGTKException when the two vectors differ in length.
     """
     if len(a) != len(b):
         raise KGTKException("Vector dimension are different!")
     # accumulate the squared component differences, then take the root:
     squared_total = 0
     for left, right in zip(a, b):
         squared_total += (left - right)**2
     return squared_total**0.5
Exemple #15
0
def run(filename, directed, log_file, output):
    """Load a tab-separated edge file as a graph_tool graph and log statistics about it.

    The header line of 'filename' determines the subject column ('node1' or
    'subject'), the object column ('node2', 'object' or 'value') and the
    predicate property ('property', 'predicate' or 'label').  All columns
    other than subject and object are loaded as edge properties.  Progress,
    the node and edge counts and the top relations are written to 'log_file'.
    If 'output' is truthy the graph is also saved there.

    Raises:
        KGTKException: wrapping any error, including a header that lacks a
            subject or object column.
    """
    from kgtk.exceptions import KGTKException

    def infer_index(h, options=()):
        """Return the position in 'h' of the first option it contains, or -1."""
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=()):
        """Return the first option contained in 'h', or '' if there is none."""
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from graph_tool import load_graph_from_csv
        import kgtk.gt.analysis_utils as gtanalysis

        with open(filename, 'r') as f:
            # Strip the line terminator first: otherwise the last column
            # name keeps its trailing newline and can never be matched.
            header = next(f).rstrip('\r\n').split('\t')

        subj_index = infer_index(header, options=['node1', 'subject'])
        obj_index = infer_index(header, options=['node2', 'object', 'value'])
        if subj_index < 0 or obj_index < 0:
            # An index of -1 would otherwise be passed on as a column
            # position instead of being reported as a missing column.
            raise ValueError('cannot find the subject and object columns in the header of %s' % filename)
        predicate = infer_predicate(
            header, options=['property', 'predicate', 'label'])

        # Every column that is not an edge endpoint becomes an edge property.
        p = [header_col for i, header_col in enumerate(header)
             if i not in (subj_index, obj_index)]

        with open(log_file, 'w') as writer:
            writer.write('loading the TSV graph now ...\n')
            G2 = load_graph_from_csv(filename,
                                     skip_first=True,
                                     directed=directed,
                                     hashed=True,
                                     ecols=[subj_index, obj_index],
                                     eprop_names=p,
                                     csv_options={'delimiter': '\t'})

            writer.write('graph loaded! It has %d nodes and %d edges\n' %
                         (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(
                    G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))

            if output:
                writer.write('now saving the graph to %s\n' % output)
                G2.save(output)
    except Exception as e:
        raise KGTKException('Error: ' + str(e))
Exemple #16
0
 def wait_for_key_spec():
     """Poll the sort key spec file until its content ends with a tab.

     The file named by sort_env['KGTK_SORT_KEY_SPEC'] is re-read up to 100 times
     with a 0.05 second pause after each unsuccessful read.  Raises KGTKException
     if no read ever sees the trailing tab.
     """
     import time
     polls_left = 100  # try for at most 5 secs:
     while polls_left:
         with open(sort_env['KGTK_SORT_KEY_SPEC'], 'r') as inp:
             if inp.read().endswith('\t'):
                 return
             time.sleep(0.05)
         polls_left -= 1
     raise KGTKException('INTERNAL ERROR: failed to communicate sort key')
Exemple #17
0
def run(input_file,
        output_file,
        columns='1',
        reverse=False,
        space=False,
        speed=False,
        extra='',
        tsv=False,
        csv=False,
        _dt=None,
        naptime=1):
    """Run sort according to the provided command-line arguments.

    Sleeps for int(naptime) seconds first (NOTE(review): the purpose of this
    delay is not visible here -- confirm it is still needed).  The input and
    output files are resolved through KGTKArgumentParser; an output of "-"
    is passed on as None.  The column separator is a comma when CSV is
    selected ('csv' or _dt == 'csv') and 'tsv' is not set, otherwise a tab.
    The sort options start from 'extra' and gain ' -r' for 'reverse' plus
    either the space or the speed configuration ('space' takes precedence).

    Returns the exit code of the executed command pipeline.  When the
    pipeline ends with SIGPIPE, stdout is reopened and None is returned.
    Any other exception is re-raised as a KGTKException.
    """
    import time

    import kgtk.cli.zconcat as zcat

    time.sleep(int(naptime))

    input_path = str(KGTKArgumentParser.get_input_file(input_file))
    output_path = str(KGTKArgumentParser.get_output_file(output_file))
    if output_path == "-":
        output_path = None

    try:
        if not tsv and (csv or _dt == 'csv'):
            colsep = ','
        else:
            colsep = '\t'

        options = extra
        if reverse:
            options += ' -r'
        if space:
            options += ' ' + space_config
        elif speed:
            options += ' ' + speed_config

        sort_pipeline = build_command(input=input_path,
                                      output=output_path,
                                      columns=columns,
                                      colsep=colsep,
                                      options=options)
        outcome = zcat.run_sh_commands(sort_pipeline)
        return outcome.exit_code
    except sh.SignalException_SIGPIPE:
        # hack to work around Python3 issue when stdout is gone when we try to report an exception;
        # without this we get an ugly 'Exception ignored...' msg when we quit with head or a pager:
        sys.stdout = os.fdopen(1)
    except Exception as err:
        raise KGTKException(f'INTERNAL ERROR: {type(err).__module__}.{err!s}\n')
Exemple #18
0
 def get_input(self, key):
     """Return the canonical input named by 'key' so it can be used with 'get_query'.
     'key' may be an input file name, input file alias or API-local name.

     The alias recorded for the input is preferred; the file entry is used when no alias
     is set.  Raises KGTKException if no input info exists for 'key'.
     """
     info = self.get_input_info(key)
     if info is None:
         raise KGTKException('no input named by key: %s' % key)
     alias = info.get('alias')
     return alias if alias is not None else info.get('file')
Exemple #19
0
 def get_result_header(self, error=True):
     """Return the list of column names for this query.  This requires the query to have
     run at least once (also again after caches were cleared).

     If no header is available yet, raise KGTKException when 'error' is true and return
     None otherwise.
     """
     self.refresh()
     header = self.kgtk_query.result_header
     if header is not None or not error:
         return header
     raise KGTKException('query needs to be run at least once to access its result header')
Exemple #20
0
 def _get_query(self, query, error=True):
     """Internal accessor that allows transparent 'query' access via objects or names.

     A KypherQuery object is returned as is; anything else is looked up in the cached
     queries.  If the lookup finds nothing, raise KGTKException when 'error' is true
     and return None otherwise.
     """
     if isinstance(query, KypherQuery):
         return query
     found = self.cached_queries.get(query)
     if found is None and error:
         raise KGTKException('cannot find query with name: %s' % query)
     return found
Exemple #21
0
 def wait_for_key_spec():
     """Poll the sort key spec file until its content ends with a tab.

     The file named by sort_env['KGTK_SORT_KEY_SPEC'] is re-read up to 100 times
     with a 0.05 second pause after each unsuccessful read.  Raises KGTKException
     if no read ever sees the trailing tab.
     """
     import time
     polls_left = 100  # try for at most 5 secs:
     while polls_left:
         with open(sort_env['KGTK_SORT_KEY_SPEC'], 'r') as inp:
             spec = inp.read()
             if spec.endswith('\t'):
                 return
             time.sleep(0.05)
         polls_left -= 1
     raise KGTKException(
         'INTERNAL ERROR: failed to communicate sort key')
Exemple #22
0
 def __call__(self, parser, namespace, values, option_string=None):
     """Record this option's value for the most recently listed input file.

     The value is stored in namespace.input_file_options, a dictionary keyed by the
     input file (as a string) whose values map option destinations to option values.
     Raises KGTKException if no input file has been listed before this option.
     """
     input_options = getattr(namespace, 'input_file_options', {}) or {}
     inputs = KGTKArgumentParser.get_input_file_list(getattr(namespace, 'input_files', []))
     if len(inputs) == 0:
         raise KGTKException('out-of-place input option: %s' % option_string)
     # the option applies to the last input seen; normalize path objects to strings:
     latest_input = str(inputs[-1])
     # boolean options carry no value of their own (also requires nargs=0):
     if self.type == bool:
         values = True
     # self.dest is the key under which this particular option is stored:
     per_file_options = input_options.setdefault(latest_input, {})
     per_file_options[self.dest] = values
     setattr(namespace, 'input_file_options', input_options)
Exemple #23
0
    def generate_prop_declaration_triple(self, node1: str, node2: str) -> bool:
        """Declare property 'node1' with datatype 'node2' and add it to the document.

        A property seen for the first time is recorded in self.prop_types.  A
        property that is already known raises KGTKException unless
        self.prop_declaration is set.  In both accepted cases a WDProperty is
        created from the mapped datatype and added as a subject.

        Returns True.
        """
        already_known = node1 in self.prop_types
        if already_known and not self.prop_declaration:
            raise KGTKException(
                "Duplicated property definition of {} found!".format(
                    node1))
        if not already_known:
            # update the known prop_types
            # NOTE(review): this stores the raw 'node2' value, not the mapped
            # datatype -- confirm that is what readers of prop_types expect.
            self.prop_types[node1] = node2

        declared = WDProperty(node1, self.datatype_mapping[node2])
        self.doc.kg.add_subject(declared)
        return True
Exemple #24
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        properties: str = '',
        undirected: bool = False,
        strong: bool = False,

        # The following are loosely typed so that importing graph_tool can be
        # postponed: ConnectedComponents cannot be referenced in the signature.
        cluster_name_method: typing.Optional[typing.Any] = None,
        cluster_name_separator: typing.Optional[str] = None,
        cluster_name_prefix: typing.Optional[str] = None,
        cluster_name_zfill: typing.Optional[int] = None,
        minimum_cluster_size: typing.Optional[int] = None,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    """Run ConnectedComponents over a single input file with cluster naming options.

    The input and output paths are resolved through KGTKArgumentParser.
    Each cluster option left at None is replaced by the corresponding
    ConnectedComponents default before the object is constructed.  'kwargs'
    is accepted but not used here.

    Returns 0 on success.  Any Exception raised by
    ConnectedComponents.process() is re-raised as a KGTKException.
    """
    from pathlib import Path

    from kgtk.exceptions import KGTKException
    from kgtk.gt.connected_components import ConnectedComponents
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions

    source_path: Path = KGTKArgumentParser.get_input_file(input_file)
    target_path: Path = KGTKArgumentParser.get_output_file(output_file)

    # ConnectedComponents is imported by now, so its defaults can be used.
    if cluster_name_method is None:
        cluster_name_method = ConnectedComponents.DEFAULT_CLUSTER_NAME_METHOD
    if cluster_name_separator is None:
        cluster_name_separator = ConnectedComponents.DEFAULT_CLUSTER_NAME_SEPARATOR
    if cluster_name_prefix is None:
        cluster_name_prefix = ConnectedComponents.DEFAULT_CLUSTER_NAME_PREFIX
    if cluster_name_zfill is None:
        cluster_name_zfill = ConnectedComponents.DEFAULT_CLUSTER_NAME_ZFILL
    if minimum_cluster_size is None:
        minimum_cluster_size = ConnectedComponents.DEFAULT_MINIMUM_CLUSTER_SIZE

    # Construction happens outside the try: only processing errors are wrapped.
    components: ConnectedComponents = ConnectedComponents(
        input_file_path=source_path,
        output_file_path=target_path,
        properties=properties,
        undirected=undirected,
        strong=strong,
        cluster_name_method=cluster_name_method,
        cluster_name_separator=cluster_name_separator,
        cluster_name_prefix=cluster_name_prefix,
        cluster_name_zfill=cluster_name_zfill,
        minimum_cluster_size=minimum_cluster_size,
    )

    try:
        components.process()
    except Exception as err:
        raise KGTKException(str(err))
    return 0
Exemple #25
0
 def set_properties(self, prop_file: str):
     """Load the property datatypes from 'prop_file' into self.prop_types.

     self.prop_types is reset first.  A 'prop_file' of "NONE" leaves it empty.
     Otherwise the first line of the file is skipped as the header and every
     following non-blank line must have exactly three tab-separated columns:
     the property, a middle column that is ignored, and the datatype name.
     The datatype name is translated through self.datatype_mapping.

     Raises:
         KGTKException: if a datatype name is not in self.datatype_mapping.
         ValueError: if a non-blank line does not have exactly three columns.
     """
     self.prop_types = {}
     if prop_file == "NONE":
         return
     with open(prop_file, "r") as fp:
         # Skip the header line; an empty file simply yields no properties.
         next(fp, None)
         for line in fp:
             line = line.rstrip("\r\n")
             if not line:
                 # Blank lines (e.g. a trailing newline) carry no property.
                 continue
             node1, _, node2 = line.split("\t")
             node2 = node2.strip()
             try:
                 self.prop_types[node1] = self.datatype_mapping[node2]
             except KeyError:
                 # Only an unknown datatype is expected here; anything else
                 # is a programming error and must not be relabelled.
                 raise KGTKException(
                     "DataType {} of node {} is not supported.\n".format(
                         node2, node1))
Exemple #26
0
def run(input_files: KGTKFiles, output=None, gz=False, bz2=False, xz=False, _debug=False):
    """Run zconcat according to the provided command-line arguments.

    The input file list is resolved through KGTKArgumentParser, turned into
    shell commands by build_command() and executed with run_sh_commands().
    The exit code of that execution is returned.  If the run is cut short by
    SIGPIPE, stdout is flushed and nothing is returned.  Any other exception
    is re-raised as a KGTKException.
    """
    # TO DO: figure out how to properly access shared --debug option
    try:
        resolved_inputs = KGTKArgumentParser.get_input_file_list(input_files)
        inputs: typing.List[str] = list(map(str, resolved_inputs))
        commands = build_command(inputs=inputs, output=output, gz=gz, bz2=bz2, xz=xz)
        outcome = run_sh_commands(commands, debug=_debug)
        return outcome.exit_code
    except sh.SignalException_SIGPIPE:
        # cleanup in case we piped and terminated prematurely:
        sys.stdout.flush()
    except Exception as e:
        message = 'INTERNAL ERROR: ' + str(e) + '\n'
        raise KGTKException(message)
Exemple #27
0
 def __init__(self, api, **kwargs):
     """Build a query object bound to the KypherApi instance 'api'.

     The keyword arguments are remembered in 'definition_args' and handed
     unchanged to 'self._define'.  Raises KGTKException when 'api' is not
     a KypherApi.
     """
     if isinstance(api, KypherApi):
         self.api = api
     else:
         raise KGTKException(
             'query needs to be linked to existing API object')
     # Nothing has been translated or executed yet.
     self.kgtk_query = self.sql = self.parameters = self.exec_wrapper = None
     self.definition_args = kwargs
     self.timestamp = -1
     self._define(**kwargs)
Exemple #28
0
    def process_presorted_input(self, kr: KgtkReader, kw: KgtkWriter):
        """Stream rows from `kr`, which must already be sorted by node1.

        Every row is handed to `self.process_row` together with the
        attribute collection of the node it belongs to.  Each time the
        node1 value increases, the collection gathered for the previous
        node is flushed through `self.process_qnode` and a fresh one is
        started; the last node is flushed after the loop.

        Raises KGTKException if a row's node1 sorts before the preceding
        row's node1.
        """
        if self.verbose:
            print("Processing presorted input.", file=self.error_file, flush=True)

        row_count: int = 0
        attributes: Lexicalize.EACH_NODE_ATTRIBUTES = self.new_each_node_attributes()
        last_seen: typing.Optional[str] = None
        node_id: typing.Optional[str] = None

        for index, row in enumerate(kr):
            row_count += 1
            node_id = row[kr.node1_column_idx]
            node_property: str = row[kr.label_column_idx]
            node_value: str = row[kr.node2_column_idx]

            # The input must be sorted on node1, lowest to highest.
            if last_seen is not None and last_seen > node_id:
                raise KGTKException("Row %d is out of order: %s > %s" %
                                    (index + 1, last_seen, node_id))

            if last_seen is None or last_seen < node_id:
                # A new node starts here; flush the one just finished, if any.
                if last_seen is not None:
                    self.process_qnode(kw, last_seen, attributes)
                attributes = self.new_each_node_attributes()
                last_seen = node_id

            self.process_row(node_id, node_property, node_value, attributes)

        # Flush whatever was collected for the final node of the file.
        if node_id is not None:
            self.process_qnode(kw, node_id, attributes)

        if self.verbose:
            print("Processed %d input rows." % (row_count), file=self.error_file, flush=True)
Exemple #29
0
def run(name, info, error):
    """
    Print `name`, `info` and the local host name, or fail when `error` is set.

    Arguments must first be declared in `add_arguments`.
    The integer returned here becomes the shell return code; returning nothing counts as 0.
    A non-zero return value can signal failure, but raising one of the exceptions from
    kgtk.exceptions is preferred because it gives the user a unified error code and message.
    """
    # import modules locally
    import socket
    from kgtk.exceptions import KGTKException

    if not error:
        hostname = socket.gethostname()
        print('name: {}, info: {}\nhost: {}'.format(name, info, hostname))
        return

    raise KGTKException('An error here\n')
def run(**kwargs):
    """Run the build-kgtk-search-input command.

    Forwards the parsed command-line options in `kwargs` to
    `ElasticsearchManager.build_kgtk_search_input`.  The keys read are
    'input_file_path', 'label_properties', 'mapping_file_path',
    'output_file_path', 'alias_properties', 'extra_alias_properties',
    'pagerank_properties', 'description_properties', 'add_text' and
    'property_datatype_file'.

    Raises:
        KGTKException: any ordinary exception (including a missing key in
            `kwargs`) is wrapped, with the formatted traceback embedded in
            the message and the original exception attached as the cause.
            KeyboardInterrupt and SystemExit are deliberately not wrapped.
    """
    # import modules locally
    import traceback

    from kgtk.exceptions import KGTKException
    from kgtk.utils.elasticsearch_manager import ElasticsearchManager

    try:
        ElasticsearchManager.build_kgtk_search_input(kwargs['input_file_path'], kwargs['label_properties'],
                                                     kwargs['mapping_file_path'], kwargs['output_file_path'],
                                                     alias_fields=kwargs['alias_properties'],
                                                     extra_alias_properties=kwargs['extra_alias_properties'],
                                                     pagerank_fields=kwargs['pagerank_properties'],
                                                     description_properties=kwargs['description_properties'],
                                                     add_text=kwargs['add_text'],
                                                     property_datatype_file=kwargs['property_datatype_file']
                                                     )
    except Exception as e:
        # Catch Exception rather than everything so Ctrl-C / sys.exit() still
        # terminate the command instead of being reported as a command error.
        message = 'Command: build-kgtk-search-input\n'
        message += 'Error Message:  {}\n'.format(traceback.format_exc())
        raise KGTKException(message) from e