Ejemplo n.º 1
0
def run(
        input_file: KGTKFiles,
        path_file: KGTKFiles,
        output_file: KGTKFiles,
        statistics_only: bool,
        undirected: bool,
        max_hops: int,
        source_column_name: typing.Optional[str],
        target_column_name: typing.Optional[str],
        shortest_path: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Write the paths that connect pairs of nodes in a KGTK edge file.

    The (source, target) pairs are read from the path file and the input
    file is loaded into a graph-tool graph.  For each pair whose source and
    target each match exactly one vertex, every path returned by graph-tool
    is written as a sequence of output edges: ``node1`` is a generated path
    identifier (``p0``, ``p1``, ...), ``label`` is the position of the edge
    within the path, and ``node2`` is the value of the input file's id
    column for that edge.  Unless ``statistics_only`` is set, every edge of
    the loaded graph is copied to the output first.

    Args:
        input_file: The KGTK edge file to load as the graph.  It must have
            node1, label, node2, and id columns.
        path_file: The KGTK file holding the (source, target) pairs.
        output_file: Where the output edges are written.
        statistics_only: When true, the graph's own edges are not copied to
            the output.
        undirected: When true, the graph is loaded as undirected.
        max_hops: Cutoff passed to ``all_paths``; not used when
            ``shortest_path`` is set.
        source_column_name: Passed to the path reader's
            ``get_node1_column_index`` to locate the source column.
        target_column_name: Passed to the path reader's
            ``get_node2_column_index`` to locate the target column.
        shortest_path: When true, use ``all_shortest_paths`` instead of
            ``all_paths``.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Accepted for interface compatibility; not read here.
        verbose: Print progress messages.
        very_verbose: Passed through to the KGTK readers and writer.
        **kwargs: Source for the reader and value option structures.

    Raises:
        KGTKException: On any failure.  The message is ``'Error: '``
            followed by the text of the underlying exception, which is
            attached as the cause.
    """
    # import modules locally
    from contextlib import ExitStack, closing
    from pathlib import Path
    import sys

    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths

    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    from kgtk.exceptions import KGTKException
    try:

        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)

        # Vertex property under which the node names are looked up.
        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file),
                  file=error_file,
                  flush=True)
        pairs = []
        paths_read: int = 0
        # closing() guarantees the path reader is released even when a
        # malformed row or a missing column aborts the read.
        with closing(
                KgtkReader.open(
                    path_kgtk_file,
                    error_file=error_file,
                    options=path_reader_options,
                    value_options=value_options,
                    verbose=verbose,
                    very_verbose=very_verbose,
                )) as pkr:
            path_source_idx: int = pkr.get_node1_column_index(
                source_column_name)
            if path_source_idx < 0:
                print("Missing node1 (source) column name in the path file.",
                      file=error_file,
                      flush=True)

            path_target_idx: int = pkr.get_node2_column_index(
                target_column_name)
            if path_target_idx < 0:
                print("Missing node2 (target) column name in the path file.",
                      file=error_file,
                      flush=True)
            if path_source_idx < 0 or path_target_idx < 0:
                raise KGTKException("Exiting due to missing columns.")

            path_row: typing.List[str]
            for path_row in pkr:
                paths_read += 1
                if len(path_row) != pkr.column_count:
                    raise KGTKException(
                        "Exiting because line %d in the path file (%s) is the wrong length: %d columns expected, %d were read."
                        % (paths_read, str(path_kgtk_file), pkr.column_count,
                           len(path_row)))
                pairs.append(
                    (path_row[path_source_idx], path_row[path_target_idx]))

        if verbose:
            print("%d path rows read" % paths_read,
                  file=error_file,
                  flush=True)
        if len(pairs) == 0:
            print("No path pairs found, the output will be empty.",
                  file=error_file,
                  flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs),
                  file=error_file,
                  flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file),
                  file=error_file,
                  flush=True)

        # The stack closes the writer first and the reader second (LIFO),
        # on both the success path and every error path.
        with ExitStack() as cleanup:
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=input_reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            cleanup.callback(kr.close)

            sub_index: int = kr.get_node1_column_index()
            if sub_index < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred_index: int = kr.get_label_column_index()
            if pred_index < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj_index: int = kr.get_node2_column_index()
            if obj_index < 0:
                print("Missing node2 (object) column",
                      file=error_file,
                      flush=True)
            id_index: int = kr.get_id_column_index()
            if id_index < 0:
                print("Missing id column", file=error_file, flush=True)
            if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
                raise KGTKException("Exiting due to missing columns.")

            # Edge properties are addressed by the input file's own column
            # names, for the id column just as for the label column.
            predicate: str = kr.column_names[pred_index]
            id_col_name: str = kr.column_names[id_index]

            G = load_graph_from_kgtk(kr,
                                     directed=not undirected,
                                     ecols=(sub_index, obj_index),
                                     verbose=verbose,
                                     out=error_file)

            output_columns: typing.List[str] = [
                'node1', 'label', 'node2', 'id'
            ]
            kw: KgtkWriter = KgtkWriter.open(output_columns,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            cleanup.callback(kw.close)

            vertex_names = G.vp[id_col]

            id_count = 0
            if not statistics_only:
                # Copy the graph's own edges to the output.
                edge_labels = G.ep[predicate]
                for e in G.edges():
                    sid, oid = e
                    lbl = edge_labels[e]
                    kw.write([
                        vertex_names[sid], lbl, vertex_names[oid],
                        '{}-{}-{}'.format(vertex_names[sid], lbl, id_count)
                    ])
                    id_count += 1
                if verbose:
                    print("%d edges found." % id_count,
                          file=error_file,
                          flush=True)

            edge_ids = G.properties[('e', id_col_name)]

            id_count = 0
            path_id = 0
            for source_node, target_node in pairs:
                source_ids = find_vertex(G,
                                         prop=vertex_names,
                                         match=source_node)
                target_ids = find_vertex(G,
                                         prop=vertex_names,
                                         match=target_node)
                # Pairs whose endpoints do not each match exactly one
                # vertex are skipped without a message.
                if len(source_ids) != 1 or len(target_ids) != 1:
                    continue

                source_id = source_ids[0]
                target_id = target_ids[0]
                if shortest_path:
                    _all_paths = all_shortest_paths(G,
                                                    source_id,
                                                    target_id,
                                                    edges=True)
                else:
                    _all_paths = all_paths(G,
                                           source_id,
                                           target_id,
                                           cutoff=max_hops,
                                           edges=True)

                for path in _all_paths:
                    node1: str = 'p%d' % path_id
                    for edge_num, an_edge in enumerate(path):
                        kw.write([
                            node1,
                            str(edge_num), edge_ids[an_edge],
                            '{}-{}-{}'.format(node1, edge_num, id_count)
                        ])
                        id_count += 1
                    path_id += 1

            if verbose:
                print("%d paths containing %d edges found." %
                      (path_id, id_count),
                      file=error_file,
                      flush=True)

    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise KGTKException('Error: ' + str(e)) from e
Ejemplo n.º 2
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        compute_degrees: bool,
        compute_pagerank: bool,
        compute_hits: bool,
        log_file: str,
        statistics_only: bool,
        vertex_in_degree: str,
        vertex_out_degree: str,
        vertex_pagerank: str,
        vertex_auth: str,
        vertex_hubs: str,
        top_n: int,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Compute graph statistics for a KGTK edge file.

    The input file is loaded into a graph-tool graph.  A textual summary
    (graph size, top relations, and optionally degree, PageRank and HITS
    statistics) is written to ``log_file``.  The output file receives,
    unless ``statistics_only`` is set, a copy of every graph edge, followed
    for each vertex by its in-degree, its out-degree, and one edge per
    computed vertex property.

    Args:
        input_file: The KGTK edge file to load; it must have node1, label
            and node2 columns.
        output_file: Where the output edges are written.
        undirected: When true, the graph is loaded as undirected.
        compute_degrees: Write degree statistics to the log file.
        compute_pagerank: Compute PageRank and log the top ``top_n`` nodes.
        compute_hits: Compute HITS and log the top ``top_n`` hubs and
            authorities.
        log_file: Path of the summary file; it is overwritten.
        statistics_only: When true, the graph's own edges are not copied to
            the output.
        vertex_in_degree: Output label for the in-degree edges.
        vertex_out_degree: Output label for the out-degree edges.
        vertex_pagerank: Output label for the PageRank edges.
        vertex_auth: Output label for the HITS authority edges.
        vertex_hubs: Output label for the HITS hub edges.
        top_n: Number of top-ranked nodes to log.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Accepted for interface compatibility; not read here.
        verbose: Print progress messages.
        very_verbose: Passed through to the KGTK reader and writer.
        **kwargs: Source for the reader and value option structures.

    Raises:
        KGTKException: On any failure.  The message is ``'Error: '``
            followed by the text of the underlying exception, which is
            attached as the cause.
    """
    # import modules locally
    from contextlib import ExitStack
    from pathlib import Path
    import sys

    from graph_tool import centrality
    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    # Maps the internal vertex property names to the requested output labels.
    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:

        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)

        # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
        directions = ['in', 'out', 'total']
        id_col = 'name'
        output_columns = ["node1", "label", "node2", "id"]

        if verbose:
            print('loading the KGTK input file...\n',
                  file=error_file,
                  flush=True)

        # The stack closes the writer first and the reader second (LIFO),
        # on both the success path and every error path.
        with ExitStack() as cleanup:
            kr: KgtkReader = KgtkReader.open(
                input_kgtk_file,
                error_file=error_file,
                options=reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
            cleanup.callback(kr.close)

            sub: int = kr.get_node1_column_index()
            if sub < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred: int = kr.get_label_column_index()
            if pred < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj: int = kr.get_node2_column_index()
            if obj < 0:
                print("Missing node2 (object) column",
                      file=error_file,
                      flush=True)
            if sub < 0 or pred < 0 or obj < 0:
                raise KGTKException("Exiting due to missing columns.")

            predicate: str = kr.column_names[pred]

            G2 = load_graph_from_kgtk(kr,
                                      directed=not undirected,
                                      ecols=(sub, obj),
                                      verbose=verbose,
                                      out=error_file)
            if verbose:
                print('graph loaded! It has %d nodes and %d edges.' %
                      (G2.num_vertices(), G2.num_edges()),
                      file=error_file,
                      flush=True)

            kw: KgtkWriter = KgtkWriter.open(output_columns,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
            cleanup.callback(kw.close)

            with open(log_file, 'w') as writer:
                writer.write('graph loaded! It has %d nodes and %d edges\n' %
                             (G2.num_vertices(), G2.num_edges()))
                writer.write('\n###Top relations:\n')
                for rel, freq in gtanalysis.get_topN_relations(
                        G2, pred_property=predicate):
                    writer.write('%s\t%d\n' % (rel, freq))

                if compute_degrees:
                    writer.write('\n###Degrees:\n')
                    for direction in directions:
                        degree_data = gtanalysis.compute_node_degree_hist(
                            G2, direction)
                        max_degree = len(degree_data) - 1
                        mean_degree, std_degree = gtanalysis.compute_avg_node_degree(
                            G2, direction)
                        writer.write(
                            '%s degree stats: mean=%f, std=%f, max=%d\n' %
                            (direction, mean_degree, std_degree, max_degree))

                if compute_pagerank:
                    writer.write('\n###PageRank\n')
                    v_pr = G2.new_vertex_property('float')
                    centrality.pagerank(G2, prop=v_pr)
                    G2.properties[('v', 'vertex_pagerank')] = v_pr
                    writer.write('Max pageranks\n')
                    result = gtanalysis.get_topn_indices(
                        G2, 'vertex_pagerank', top_n, id_col)
                    for n_id, n_label, pr in result:
                        writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

                if compute_hits:
                    writer.write('\n###HITS\n')
                    # The first element of the result is not used here.
                    _, G2.vp['vertex_hubs'], G2.vp[
                        'vertex_auth'] = gtanalysis.compute_hits(G2)
                    writer.write('HITS hubs\n')
                    main_hubs = gtanalysis.get_topn_indices(
                        G2, 'vertex_hubs', top_n, id_col)
                    for n_id, n_label, hubness in main_hubs:
                        writer.write('%s\t%s\t%f\n' %
                                     (n_id, n_label, hubness))
                    writer.write('HITS auth\n')
                    main_auth = gtanalysis.get_topn_indices(
                        G2, 'vertex_auth', top_n, id_col)
                    for n_id, n_label, authority in main_auth:
                        writer.write('%s\t%s\t%f\n' %
                                     (n_id, n_label, authority))

            vertex_names = G2.vp[id_col]

            id_count = 0
            if not statistics_only:
                # Copy the graph's own edges to the output.
                edge_labels = G2.ep[predicate]
                for e in G2.edges():
                    sid, oid = e
                    lbl = edge_labels[e]
                    kw.write([
                        vertex_names[sid], lbl, vertex_names[oid],
                        '{}-{}-{}'.format(vertex_names[sid], lbl, id_count)
                    ])
                    id_count += 1

            # The per-vertex edges use their own id sequence, restarting at 0.
            id_count = 0
            for v in G2.vertices():
                v_id = vertex_names[v]
                kw.write([
                    v_id, vertex_in_degree,
                    str(v.in_degree()),
                    '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)
                ])
                id_count += 1
                kw.write([
                    v_id, vertex_out_degree,
                    str(v.out_degree()),
                    '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)
                ])
                id_count += 1

                for vprop in G2.vertex_properties.keys():
                    # The name property identifies the vertex; it is not a
                    # statistic to report.
                    if vprop == id_col:
                        continue
                    out_label = v_prop_dict[vprop]
                    kw.write([
                        v_id, out_label,
                        str(G2.vp[vprop][v]),
                        '{}-{}-{}'.format(v_id, out_label, id_count)
                    ])
                    id_count += 1

    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise KGTKException('Error: ' + str(e)) from e
Ejemplo n.º 3
0
    def process(self):
        """Write the connected components of the input graph as KGTK edges.

        The input file is loaded into a graph-tool graph.  When
        ``self.properties`` is set (a comma-separated list of label values),
        the graph's edges are first replaced by only those edges whose label
        matches one of the listed values.  Components are then computed with
        ``label_components``, clusters smaller than
        ``self.minimum_cluster_size`` are dropped, the remaining clusters
        are renamed by ``self.name_clusters``, and one
        ``(node, 'connected_component', cluster)`` edge is written per
        member node, sorted by cluster and then by node name.

        The input reader and the output writer are closed on every exit
        path, including errors.
        """
        input_kr: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            who="input",
            options=self.input_reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        try:
            # Element 1 of the key columns is used as the label column index.
            input_key_columns: typing.List[int] = self.get_key_columns(
                input_kr, "input")
            label = input_kr.column_names[input_key_columns[1]]

            g = load_graph_from_kgtk(input_kr, directed=not self.undirected)

            header = ['node1', 'label', 'node2']
            if self.properties:
                es = []
                label_prop = g.edge_properties[label]
                for wanted_label in self.properties.split(','):
                    es.extend(find_edge(g, label_prop, wanted_label))
                # NOTE(review): the descriptors in `es` were obtained before
                # clear_edges(); confirm they are still usable as
                # (source, target) pairs afterwards.
                g.clear_edges()
                g.add_edge_list(list(set(es)))
            comp, hist = label_components(g, directed=self.strong)

            ew: KgtkWriter = KgtkWriter.open(header,
                                             self.output_file_path,
                                             mode=input_kr.mode,
                                             require_all_columns=False,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=True,
                                             gzip_in_parallel=False,
                                             verbose=self.verbose,
                                             very_verbose=self.very_verbose)
            try:
                # Group the vertex names by the component each belongs to.
                vertex_names = g.vertex_properties['name']
                clusters: typing.MutableMapping[str,
                                                typing.List[str]] = dict()
                v: int
                for v, c in enumerate(comp):
                    clusters.setdefault(str(c), []).append(vertex_names[v])

                # Drop the clusters that are below the minimum size.
                trimmed_clusters: typing.MutableMapping[
                    str, typing.List[str]] = {
                        cluster_id: members
                        for cluster_id, members in clusters.items()
                        if len(members) >= self.minimum_cluster_size
                    }

                named_clusters: typing.MutableMapping[
                    str,
                    typing.List[str]] = self.name_clusters(trimmed_clusters)
                for cluster_id in sorted(named_clusters.keys()):
                    for name in sorted(named_clusters[cluster_id]):
                        ew.write([name, 'connected_component', cluster_id])
            finally:
                ew.close()
        finally:
            input_kr.close()
Ejemplo n.º 4
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Load a KGTK edge file into a graph-tool graph and optionally save it.

    The input file is loaded with ``load_graph_from_kgtk``.  In verbose
    mode the graph size and the top relations are printed.  When an output
    file is given, the graph is saved to it with the graph's ``save``
    method.

    Args:
        input_file: The KGTK edge file to load; it must have node1, label
            and node2 columns.
        output_file: Optional destination for the saved graph.
        undirected: When true, the graph is loaded as undirected.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Accepted for interface compatibility; not read here.
        show_options: Accepted for interface compatibility; not read here.
        verbose: Print progress messages and the top relations.
        very_verbose: Passed through to the KGTK reader.
        **kwargs: Source for the reader and value option structures.

    Raises:
        KGTKException: On any failure.  The message is ``'Error: '``
            followed by the text of the underlying exception, which is
            attached as the cause.
    """
    # import modules locally
    from pathlib import Path
    import sys

    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    try:
        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        output_gt_file: typing.Optional[
            Path] = KGTKArgumentParser.get_optional_output_file(output_file)

        if verbose:
            print('loading the KGTK input file...\n',
                  file=error_file,
                  flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        try:
            sub: int = kr.get_node1_column_index()
            if sub < 0:
                print("Missing node1 (subject) column.",
                      file=error_file,
                      flush=True)
            pred: int = kr.get_label_column_index()
            if pred < 0:
                print("Missing label (predicate) column.",
                      file=error_file,
                      flush=True)
            obj: int = kr.get_node2_column_index()
            if obj < 0:
                print("Missing node2 (object) column",
                      file=error_file,
                      flush=True)
            if sub < 0 or pred < 0 or obj < 0:
                raise KGTKException("Exiting due to missing columns.")

            G2 = load_graph_from_kgtk(kr,
                                      directed=not undirected,
                                      ecols=(sub, obj),
                                      verbose=verbose,
                                      out=error_file)

            if verbose:
                print('graph loaded! It has %d nodes and %d edges.' %
                      (G2.num_vertices(), G2.num_edges()),
                      file=error_file,
                      flush=True)
                print('\n###Top relations:', file=error_file, flush=True)
                for rel, freq in gtanalysis.get_topN_relations(
                        G2, pred_property=kr.column_names[pred]):
                    print('%s\t%d' % (rel, freq),
                          file=error_file,
                          flush=True)
        finally:
            # Release the reader on the success path as well as on errors.
            kr.close()

        if output_gt_file is not None:
            if verbose:
                print('\nNow saving the graph to %s' % str(output_gt_file),
                      file=error_file,
                      flush=True)
            G2.save(str(output_gt_file))
            if verbose:
                print('Done saving the graph.', file=error_file, flush=True)
    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise KGTKException('Error: ' + str(e)) from e
Ejemplo n.º 5
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        root: typing.Optional[typing.List[str]],
        rootfile,
        rootfilecolumn,
        subject_column_name: typing.Optional[str],
        object_column_name: typing.Optional[str],
        predicate_column_name: typing.Optional[str],
        props: typing.Optional[typing.List[str]],
        props_file: typing.Optional[str],
        propsfilecolumn: typing.Optional[str],
        inverted: bool,
        inverted_props: typing.Optional[typing.List[str]],
        inverted_props_file: typing.Optional[str],
        invertedpropsfilecolumn: typing.Optional[str],
        undirected: bool,
        undirected_props: typing.Optional[typing.List[str]],
        undirected_props_file: typing.Optional[str],
        undirectedpropsfilecolumn: typing.Optional[str],
        label: str,
        selflink_bool: bool,
        show_properties: bool,
        breadth_first: bool,
        depth_limit: typing.Optional[int],
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    import sys
    import csv
    from pathlib import Path
    import time
    from graph_tool.search import dfs_iterator, bfs_iterator, bfs_search, BFSVisitor
    # from graph_tool import load_graph_from_csv
    from graph_tool.util import find_edge
    from kgtk.exceptions import KGTKException
    from kgtk.cli_argparse import KGTKArgumentParser

    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    #Graph-tool names columns that are not subject or object c0, c1... This function finds the number that graph tool assigned to the predicate column
    def find_pred_position(sub, pred, obj):
        if pred < sub and pred < obj:
            return pred
        elif (pred > sub and pred < obj) or (pred < sub and pred > obj):
            return pred - 1
        else:
            return pred - 2

    def get_edges_by_edge_prop(g, p, v):
        return find_edge(g, prop=g.properties[('e', p)], match=v)

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="input", fallback=True)
    root_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="root", fallback=True)
    props_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="props", fallback=True)
    undirected_props_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="undirected_props", fallback=True)
    inverted_props_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="inverted_props", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    if root is None:
        root = []  # This simplifies matters.

    if props is None:
        props = []  # This simplifies matters.

    if undirected_props is None:
        undirected_props = []  # This simplifies matters.

    if inverted_props is None:
        inverted_props = []  # This simplifies matters.

    if show_options:
        if root is not None:
            print("--root %s" % " ".join(root), file=error_file)
        if rootfile is not None:
            print("--rootfile=%s" % rootfile, file=error_file)
        if rootfilecolumn is not None:
            print("--rootfilecolumn=%s" % rootfilecolumn, file=error_file)
        if subject_column_name is not None:
            print("--subj=%s" % subject_column_name, file=error_file)
        if object_column_name is not None:
            print("--obj=%s" % object_column_name, file=error_file)
        if predicate_column_name is not None:
            print("--pred=%s" % predicate_column_name, file=error_file)

        if props is not None:
            print("--props=%s" % " ".join(props), file=error_file)
        if props_file is not None:
            print("--props-file=%s" % props_file, file=error_file)
        if propsfilecolumn is not None:
            print("--propsfilecolumn=%s" % propsfilecolumn, file=error_file)

        print("--inverted=%s" % str(inverted), file=error_file)
        if inverted_props is not None:
            print("--inverted-props=%s" % " ".join(inverted_props),
                  file=error_file)
        if inverted_props_file is not None:
            print("--inverted-props-file=%s" % inverted_props_file,
                  file=error_file)
        if invertedpropsfilecolumn is not None:
            print("--invertedpropsfilecolumn=%s" % invertedpropsfilecolumn,
                  file=error_file)

        print("--undirected=%s" % str(undirected), file=error_file)
        if undirected_props is not None:
            print("--undirected-props=%s" % " ".join(undirected_props),
                  file=error_file)
        if undirected_props_file is not None:
            print("--undirected-props-file=%s" % undirected_props_file,
                  file=error_file)
        if undirectedpropsfilecolumn is not None:
            print("--undirectedpropsfilecolumn=%s" % undirectedpropsfilecolumn,
                  file=error_file)

        print("--label=%s" % label, file=error_file)
        print("--selflink=%s" % str(selflink_bool), file=error_file)
        print("--breadth-first=%s" % str(breadth_first), file=error_file)
        if depth_limit is not None:
            print("--depth-limit=%d" % depth_limit, file=error_file)
        input_reader_options.show(out=error_file)
        root_reader_options.show(out=error_file)
        props_reader_options.show(out=error_file)
        undirected_props_reader_options.show(out=error_file)
        inverted_props_reader_options.show(out=error_file)
        value_options.show(out=error_file)
        KgtkReader.show_debug_arguments(errors_to_stdout=errors_to_stdout,
                                        errors_to_stderr=errors_to_stderr,
                                        show_options=show_options,
                                        verbose=verbose,
                                        very_verbose=very_verbose,
                                        out=error_file)
        print("=======", file=error_file, flush=True)

    if inverted and (len(inverted_props) > 0
                     or inverted_props_file is not None):
        raise KGTKException(
            "--inverted is not allowed with --inverted-props or --inverted-props-file"
        )

    if undirected and (len(undirected_props) > 0
                       or undirected_props_file is not None):
        raise KGTKException(
            "--undirected is not allowed with --undirected-props or --undirected-props-file"
        )

    if depth_limit is not None:
        if not breadth_first:
            raise KGTKException(
                "--depth-limit is not allowed without --breadth-first")
        if depth_limit <= 0:
            raise KGTKException("--depth-limit requires a positive argument")

    root_set: typing.Set = set()

    if rootfile is not None:
        if verbose:
            print("Reading the root file %s" % repr(rootfile),
                  file=error_file,
                  flush=True)
        try:
            root_kr: KgtkReader = KgtkReader.open(
                Path(rootfile),
                error_file=error_file,
                who="root",
                options=root_reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
        except SystemExit:
            raise KGTKException("Exiting.")

        rootcol: int
        if root_kr.is_edge_file:
            rootcol = int(
                rootfilecolumn
            ) if rootfilecolumn is not None and rootfilecolumn.isdigit(
            ) else root_kr.get_node1_column_index(rootfilecolumn)
        elif root_kr.is_node_file:
            rootcol = int(
                rootfilecolumn
            ) if rootfilecolumn is not None and rootfilecolumn.isdigit(
            ) else root_kr.get_id_column_index(rootfilecolumn)
        elif rootfilecolumn is not None:
            rootcol = int(
                rootfilecolumn
            ) if rootfilecolumn is not None and rootfilecolumn.isdigit(
            ) else root_kr.column_name_map.get(rootfilecolumn, -1)
        else:
            root_kr.close()
            raise KGTKException(
                "The root file is neither an edge nor a node file and the root column name was not supplied."
            )

        if rootcol < 0:
            root_kr.close()
            raise KGTKException("Unknown root column %s" %
                                repr(rootfilecolumn))

        for row in root_kr:
            rootnode: str = row[rootcol]
            root_set.add(rootnode)
        root_kr.close()

    if len(root) > 0:
        if verbose:
            print("Adding root nodes from the command line.",
                  file=error_file,
                  flush=True)
        root_group: str
        for root_group in root:
            r: str
            for r in root_group.split(','):
                if verbose:
                    print("... adding %s" % repr(r),
                          file=error_file,
                          flush=True)
                root_set.add(r)
    if len(root_set) == 0:
        print(
            "Warning: No nodes in the root set, the output file will be empty.",
            file=error_file,
            flush=True)
    elif verbose:
        print("%d nodes in the root set." % len(root_set),
              file=error_file,
              flush=True)

    property_set: typing.Set[str] = set()
    if props_file is not None:
        if verbose:
            print("Reading the root file %s" % repr(props_file),
                  file=error_file,
                  flush=True)
        try:
            props_kr: KgtkReader = KgtkReader.open(
                Path(props_file),
                error_file=error_file,
                who="props",
                options=props_reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
        except SystemExit:
            raise KGTKException("Exiting.")

        propscol: int
        if props_kr.is_edge_file:
            propscol = int(
                propsfilecolumn
            ) if propsfilecolumn is not None and propsfilecolumn.isdigit(
            ) else props_kr.get_node1_column_index(propsfilecolumn)
        elif props_kr.is_node_file:
            propscol = int(
                propsfilecolumn
            ) if propsfilecolumn is not None and propsfilecolumn.isdigit(
            ) else props_kr.get_id_column_index(propsfilecolumn)
        elif propsfilecolumn is not None:
            propscol = int(
                propsfilecolumn
            ) if propsfilecolumn is not None and propsfilecolumn.isdigit(
            ) else props_kr.column_name_map.get(propsfilecolumn, -1)
        else:
            props_kr.close()
            raise KGTKException(
                "The props file is neither an edge nor a node file and the root column name was not supplied."
            )

        if propscol < 0:
            props_kr.close()
            raise KGTKException("Unknown props column %s" %
                                repr(propsfilecolumn))

        for row in props_kr:
            property_name: str = row[propscol]
            property_set.add(property_name)
        props_kr.close()

    if len(props) > 0:
        # Filter the graph, G, to include only edges where the predicate (label)
        # column contains one of the selected properties.

        prop_group: str
        for prop_group in props:
            prop: str
            for prop in prop_group.split(','):
                property_set.add(prop)
    if verbose and len(property_set) > 0:
        print("property set=%s" % " ".join(sorted(list(property_set))),
              file=error_file,
              flush=True)

    undirected_property_set: typing.Set[str] = set()
    if undirected_props_file is not None:
        if verbose:
            print("Reading the undirected properties file %s" %
                  repr(undirected_props_file),
                  file=error_file,
                  flush=True)
        try:
            undirected_props_kr: KgtkReader = KgtkReader.open(
                Path(undirected_props_file),
                error_file=error_file,
                who="undirected_props",
                options=undirected_props_reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
        except SystemExit:
            raise KGTKException("Exiting.")

        undirected_props_col: int
        if undirected_props_kr.is_edge_file:
            undirected_props_col = int(
                undirectedpropsfilecolumn
            ) if undirectedpropsfilecolumn is not None and undirectedpropsfilecolumn.isdigit(
            ) else undirected_props_kr.get_node1_column_index(
                undirectedpropsfilecolumn)
        elif undirected_props_kr.is_node_file:
            undirected_props_col = int(
                undirectedpropsfilecolumn
            ) if undirectedpropsfilecolumn is not None and undirectedpropsfilecolumn.isdigit(
            ) else undirected_props_kr.get_id_column_index(
                undirectedpropsfilecolumn)
        elif undirectedpropsfilecolumn is not None:
            undirected_props_col = int(
                undirectedpropsfilecolumn
            ) if undirectedpropsfilecolumn is not None and undirectedpropsfilecolumn.isdigit(
            ) else undirected_props_kr.column_name_map.get(
                undirectedpropsfilecolumn, -1)
        else:
            undirected_props_kr.close()
            raise KGTKException(
                "The undirected props file is neither an edge nor a node file and the root column name was not supplied."
            )

        if undirected_props_col < 0:
            undirected_props_kr.close()
            raise KGTKException("Unknown undirected properties column %s" %
                                repr(undirectedpropsfilecolumn))

        for row in undirected_props_kr:
            undirected_property_name: str = row[undirected_props_col]
            undirected_property_set.add(undirected_property_name)
        undirected_props_kr.close()
    if len(undirected_props) > 0:
        # Edges where the predicate (label) column contains one of the selected
        # properties will be treated as undirected links.

        und_prop_group: str
        for und_prop_group in undirected_props:
            und_prop: str
            for und_prop in und_prop_group.split(','):
                undirected_property_set.add(und_prop)
    if verbose and len(undirected_property_set) > 0:
        print("undirected property set=%s" %
              " ".join(sorted(list(undirected_property_set))),
              file=error_file,
              flush=True)

    inverted_property_set: typing.Set[str] = set()
    if inverted_props_file is not None:
        if verbose:
            print("Reading the inverted properties file %s" %
                  repr(inverted_props_file),
                  file=error_file,
                  flush=True)
        try:
            inverted_props_kr: KgtkReader = KgtkReader.open(
                Path(inverted_props_file),
                error_file=error_file,
                who="inverted_props",
                options=inverted_props_reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
        except SystemExit:
            raise KGTKException("Exiting.")

        inverted_props_col: int
        if inverted_props_kr.is_edge_file:
            inverted_props_col = int(
                invertedpropsfilecolumn
            ) if invertedpropsfilecolumn is not None and invertedpropsfilecolumn.isdigit(
            ) else inverted_props_kr.get_node1_column_index(
                invertedpropsfilecolumn)
        elif inverted_props_kr.is_node_file:
            inverted_props_col = int(
                invertedpropsfilecolumn
            ) if invertedpropsfilecolumn is not None and invertedpropsfilecolumn.isdigit(
            ) else inverted_props_kr.get_id_column_index(
                invertedpropsfilecolumn)
        elif invertedpropsfilecolumn is not None:
            inverted_props_col = int(
                invertedpropsfilecolumn
            ) if invertedpropsfilecolumn is not None and invertedpropsfilecolumn.isdigit(
            ) else inverted_props_kr.column_name_map.get(
                invertedpropsfilecolumn, -1)
        else:
            inverted_props_kr.close()
            raise KGTKException(
                "The inverted props file is neither an edge nor a node file and the root column name was not supplied."
            )

        if inverted_props_col < 0:
            inverted_props_kr.close()
            raise KGTKException("Unknown inverted properties column %s" %
                                repr(invertedpropsfilecolumn))

        for row in inverted_props_kr:
            inverted_property_name: str = row[inverted_props_col]
            inverted_property_set.add(inverted_property_name)
        inverted_props_kr.close()

    if len(inverted_props) > 0:
        # Edges where the predicate (label) column contains one of the selected
        # properties will have the source and target columns swapped.

        inv_prop_group: str
        for inv_prop_group in inverted_props:
            inv_prop: str
            for inv_prop in inv_prop_group.split(','):
                inverted_property_set.add(inv_prop)
    if verbose and len(inverted_property_set):
        print("inverted property set=%s" %
              " ".join(sorted(list(inverted_property_set))),
              file=error_file,
              flush=True)

    try:
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            who="input",
            options=input_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
    except SystemExit:
        raise KGTKException("Exiting.")

    sub: int = kr.get_node1_column_index(subject_column_name)
    if sub < 0:
        print("Unknown subject column %s" % repr(subject_column_name),
              file=error_file,
              flush=True)

    pred: int = kr.get_label_column_index(predicate_column_name)
    if pred < 0:
        print("Unknown predicate column %s" % repr(predicate_column_name),
              file=error_file,
              flush=True)

    obj: int = kr.get_node2_column_index(object_column_name)
    if obj < 0:
        print("Unknown object column %s" % repr(object_column_name),
              file=error_file,
              flush=True)

    if sub < 0 or pred < 0 or obj < 0:
        kr.close()
        raise KGTKException("Exiting due to unknown column.")

    if verbose:
        print("special columns: sub=%d pred=%d obj=%d" % (sub, pred, obj),
              file=error_file,
              flush=True)

    # G = load_graph_from_csv(filename,not(undirected),skip_first=not(header_bool),hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj))
    G = load_graph_from_kgtk(kr,
                             directed=not undirected,
                             inverted=inverted,
                             ecols=(sub, obj),
                             pcol=pred,
                             pset=property_set,
                             upset=undirected_property_set,
                             ipset=inverted_property_set,
                             verbose=verbose,
                             out=error_file)

    name = G.vp[
        "name"]  # Get the vertex name property map (vertex to ndoe1 (subject) name)

    if show_properties:
        print("Graph name=%s" % repr(name), file=error_file, flush=True)
        print("Graph properties:", file=error_file, flush=True)
        key: typing.Any
        for key in G.properties:
            print("    %s: %s" % (repr(key), repr(G.properties[key])),
                  file=error_file,
                  flush=True)

    index_list = []
    for v in G.vertices():
        if name[v] in root_set:
            index_list.append(v)
    if len(index_list) == 0:
        print(
            "Warning: No root nodes found in the graph, the output file will be empty.",
            file=error_file,
            flush=True)
    elif verbose:
        print("%d root nodes found in the graph." % len(index_list),
              file=error_file,
              flush=True)

    output_header: typing.List[str] = ['node1', 'label', 'node2']

    try:
        kw: KgtkWriter = KgtkWriter.open(output_header,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)
    except SystemExit:
        raise KGTKException("Exiting.")

    for index in index_list:
        if selflink_bool:
            kw.writerow([name[index], label, name[index]])

        if breadth_first:
            if depth_limit is None:
                for e in bfs_iterator(G, G.vertex(index)):
                    kw.writerow([name[index], label, name[e.target()]])

            else:

                class DepthExceeded(Exception):
                    pass

                class DepthLimitedVisitor(BFSVisitor):
                    def __init__(self, name, pred, dist):
                        self.name = name
                        self.pred = pred
                        self.dist = dist

                    def tree_edge(self, e):
                        self.pred[e.target()] = int(e.source())
                        newdist = self.dist[e.source()] + 1
                        if depth_limit is not None and newdist > depth_limit:
                            raise DepthExceeded
                        self.dist[e.target()] = newdist
                        kw.writerow([name[index], label, name[e.target()]])

                dist = G.new_vertex_property("int")
                pred = G.new_vertex_property("int64_t")
                try:
                    bfs_search(G, G.vertex(index),
                               DepthLimitedVisitor(name, pred, dist))
                except DepthExceeded:
                    pass
        else:
            for e in dfs_iterator(G, G.vertex(index)):
                kw.writerow([name[index], label, name[e.target()]])

    kw.close()
    kr.close()
Ejemplo n.º 6
0
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        root: typing.Optional[typing.List[str]],
        rootfile,
        rootfilecolumn,
        subject_column_name: typing.Optional[str],
        object_column_name: typing.Optional[str],
        predicate_column_name: typing.Optional[str],
        props: typing.Optional[typing.List[str]],
        undirected: bool,
        label: str,
        selflink_bool: bool,
        show_properties: bool,
        breadth_first: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    """Write the nodes reached from a set of root nodes as a KGTK edge file.

    The input file is loaded into a graph-tool graph.  For every root node
    that is present in the graph, one output row
    ``[root, label, target]`` is written for the target of each edge yielded
    by a breadth-first or depth-first traversal that starts at the root.

    Args:
        input_file: The KGTK input file holding the graph edges.
        output_file: The KGTK output file; it gets the columns
            ``node1``, ``label`` and ``node2``.
        root: Root node names; each list item may itself be a
            comma-separated group of names.  ``None`` means no names.
        rootfile: Optional path of a KGTK file that supplies more root nodes.
        rootfilecolumn: Optional column of ``rootfile`` holding the root
            nodes.  A string of digits is used as a column index, anything
            else as a column name.
        subject_column_name: Optional name of the subject column of the
            input file.
        object_column_name: Optional name of the object column of the
            input file.
        predicate_column_name: Optional name of the predicate column of the
            input file.
        props: Property values; each list item may be a comma-separated
            group.  When any are given, the graph's edges are replaced by
            the edges whose predicate-column edge property matches one of
            the values.
        undirected: Load the graph as undirected when true.
        label: The value written to the ``label`` column of every output row.
        selflink_bool: When true, also write a ``[root, label, root]`` row
            for every root node found in the graph.
        show_properties: When true, print the graph's property maps to the
            error file.
        breadth_first: Traverse breadth-first when true, depth-first
            otherwise.
        errors_to_stdout: Send messages to stdout instead of stderr.
        errors_to_stderr: Only reported by the ``show_options`` output.
        show_options: When true, print the effective options to the error
            file before doing any work.
        verbose: Print progress messages to the error file.
        very_verbose: Passed through to the KGTK readers and writer.
        **kwargs: Source for the reader and value option structures.

    Raises:
        KGTKException: When a file cannot be opened (the reader/writer
            signalled ``SystemExit``), when the root column cannot be
            determined, or when the subject, predicate or object column of
            the input file is unknown.
    """
    import sys
    from pathlib import Path

    from graph_tool.search import dfs_iterator, bfs_iterator
    # from graph_tool import load_graph_from_csv
    from graph_tool.util import find_edge
    from kgtk.exceptions import KGTKException
    from kgtk.cli_argparse import KGTKArgumentParser

    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="input", fallback=True)
    root_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
        kwargs, who="root", fallback=True)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Normalize the optional lists so the code below can treat them uniformly.
    if root is None:
        root = []

    if props is None:
        props = []

    if show_options:
        if root is not None:
            print("--root %s" % " ".join(root), file=error_file)
        if rootfile is not None:
            print("--rootfile=%s" % rootfile, file=error_file)
        if subject_column_name is not None:
            print("--subj=%s" % subject_column_name, file=error_file)
        if object_column_name is not None:
            print("--obj=%s" % object_column_name, file=error_file)
        if predicate_column_name is not None:
            print("--pred=%s" % predicate_column_name, file=error_file)
        if props is not None:
            print("--props=%s" % " ".join(props), file=error_file)
        print("--undirected=%s" % str(undirected), file=error_file)
        print("--label=%s" % label, file=error_file)
        print("--selflink=%s" % str(selflink_bool), file=error_file)
        print("--breadth-first=%s" % str(breadth_first), file=error_file)
        input_reader_options.show(out=error_file)
        root_reader_options.show(out=error_file)
        value_options.show(out=error_file)
        KgtkReader.show_debug_arguments(errors_to_stdout=errors_to_stdout,
                                        errors_to_stderr=errors_to_stderr,
                                        show_options=show_options,
                                        verbose=verbose,
                                        very_verbose=very_verbose,
                                        out=error_file)
        print("=======", file=error_file, flush=True)

    root_set: typing.Set = set()

    if rootfile is not None:
        if verbose:
            print("Reading the root file %s" % repr(rootfile),
                  file=error_file,
                  flush=True)
        try:
            root_kr: KgtkReader = KgtkReader.open(
                Path(rootfile),
                error_file=error_file,
                who="root",
                options=root_reader_options,
                value_options=value_options,
                verbose=verbose,
                very_verbose=very_verbose,
            )
        except SystemExit:
            raise KGTKException("Exiting.")

        # The finally clause closes the root reader on every exit path,
        # including the error paths and a failure while iterating the rows.
        try:
            rootcol: int
            if rootfilecolumn is not None and rootfilecolumn.isdigit():
                # An all-digit column is an index, whatever the file type.
                rootcol = int(rootfilecolumn)
            elif root_kr.is_edge_file:
                rootcol = root_kr.get_node1_column_index(rootfilecolumn)
            elif root_kr.is_node_file:
                rootcol = root_kr.get_id_column_index(rootfilecolumn)
            elif rootfilecolumn is not None:
                rootcol = root_kr.column_name_map.get(rootfilecolumn, -1)
            else:
                raise KGTKException(
                    "The root file is neither an edge nor a node file and the root column name was not supplied."
                )

            if rootcol < 0:
                raise KGTKException("Unknown root column %s" %
                                    repr(rootfilecolumn))

            for row in root_kr:
                root_set.add(row[rootcol])
        finally:
            root_kr.close()

    if len(root) > 0:
        if verbose:
            print("Adding root nodes from the command line.",
                  file=error_file,
                  flush=True)
        root_group: str
        for root_group in root:
            r: str
            for r in root_group.split(','):
                if verbose:
                    print("... adding %s" % repr(r),
                          file=error_file,
                          flush=True)
                root_set.add(r)
    if len(root_set) == 0:
        print(
            "Warning: No nodes in the root set, the output file will be empty.",
            file=error_file,
            flush=True)
    elif verbose:
        print("%d nodes in the root set." % len(root_set),
              file=error_file,
              flush=True)

    try:
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            who="input",
            options=input_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
    except SystemExit:
        raise KGTKException("Exiting.")

    # The finally clause closes the input reader on every exit path.
    try:
        sub: int = kr.get_node1_column_index(subject_column_name)
        if sub < 0:
            print("Unknown subject column %s" % repr(subject_column_name),
                  file=error_file,
                  flush=True)

        pred: int = kr.get_label_column_index(predicate_column_name)
        if pred < 0:
            print("Unknown predicate column %s" % repr(predicate_column_name),
                  file=error_file,
                  flush=True)

        obj: int = kr.get_node2_column_index(object_column_name)
        if obj < 0:
            print("Unknown object column %s" % repr(object_column_name),
                  file=error_file,
                  flush=True)

        # Every unknown column is reported above before giving up.
        if sub < 0 or pred < 0 or obj < 0:
            raise KGTKException("Exiting due to unknown column.")

        if verbose:
            print("special columns: sub=%d pred=%d obj=%d" % (sub, pred, obj),
                  file=error_file,
                  flush=True)

        # G = load_graph_from_csv(filename,not(undirected),skip_first=not(header_bool),hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj))
        G = load_graph_from_kgtk(kr,
                                 directed=not undirected,
                                 ecols=(sub, obj),
                                 verbose=verbose,
                                 out=error_file)

        # The vertex property map that is indexed below to obtain the node
        # name written for each vertex.
        name = G.vp["name"]

        if show_properties:
            print("Graph name=%s" % name, file=error_file, flush=True)
            print("Graph properties:", file=error_file, flush=True)
            key: typing.Any
            for key in G.properties:
                print("    %s: %s" % (repr(key), repr(G.properties[key])),
                      file=error_file,
                      flush=True)

        # The graph vertices whose name is one of the requested roots.
        index_list = [v for v in G.vertices() if name[v] in root_set]
        if len(index_list) == 0:
            print(
                "Warning: No root nodes found in the graph, the output file will be empty.",
                file=error_file,
                flush=True)
        elif verbose:
            print("%d root nodes found in the graph." % len(index_list),
                  file=error_file,
                  flush=True)

        if len(props) > 0:
            # The edge property map is looked up by the name of the
            # predicate column of the input file.
            pred_label: str = kr.column_names[pred]
            if verbose:
                print("pred_label=%s" % repr(pred_label),
                      file=error_file,
                      flush=True)

            property_list: typing.List[str] = [
                prop for prop_group in props for prop in prop_group.split(',')
            ]
            if verbose:
                print("property list=%s" % " ".join(property_list),
                      file=error_file,
                      flush=True)

            # Keep only the edges whose predicate matches a selected property.
            # NOTE(review): the matched edge descriptors are collected first
            # and handed to add_edge_list() after clear_edges(); confirm that
            # graph-tool keeps them usable across the clear.
            pred_map = G.properties[('e', pred_label)]
            edge_filter_set = set()
            for prop in property_list:
                edge_filter_set.update(find_edge(G, prop=pred_map, match=prop))
            G.clear_edges()
            G.add_edge_list(list(edge_filter_set))

        output_header: typing.List[str] = ['node1', 'label', 'node2']

        try:
            kw: KgtkWriter = KgtkWriter.open(output_header,
                                             output_kgtk_file,
                                             mode=KgtkWriter.Mode.EDGE,
                                             require_all_columns=True,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=False,
                                             verbose=verbose,
                                             very_verbose=very_verbose)
        except SystemExit:
            raise KGTKException("Exiting.")

        # The finally clause closes the writer even if a traversal fails.
        try:
            edge_iterator = bfs_iterator if breadth_first else dfs_iterator
            for index in index_list:
                if selflink_bool:
                    kw.writerow([name[index], label, name[index]])

                for e in edge_iterator(G, G.vertex(index)):
                    kw.writerow([name[index], label, name[e.target()]])
        finally:
            kw.close()
    finally:
        kr.close()