Esempio n. 1
0
def read_graph(filename, direction):
    """Load a space-delimited edge list into a graph-tool graph.

    `direction` decides whether the result is directed; vertex ids are
    hashed string values taken verbatim from the file.
    """
    csv_opts = {'delimiter': ' '}
    return gt.load_graph_from_csv(filename,
                                  directed=direction,
                                  hashed=True,
                                  string_vals=True,
                                  csv_options=csv_opts)
Esempio n. 2
0
def run_motif_significance(graph,
                           directed=True,
                           data_loc="../data/",
                           motif_size=3,
                           n_shuffles=16,
                           s_model='uncorrelated'):
    """Run z-score computation for all `motif_size` subgraphs on the given
    `graph`. By default, the graph is loaded as a directed graph_tool
    instance.

    Parameters
    ==========
        graph: name of the graph file."""
    edges_path = data_loc + graph + ".edges"
    loaded = load_graph_from_csv(edges_path,
                                 directed,
                                 csv_options={'quotechar': '"',
                                              'delimiter': ' '})
    motifs, zscores = motif_significance(loaded, motif_size, n_shuffles,
                                         shuffle_model=s_model)
    # 'm' marks a directed run, 'um' an undirected one.
    suffix = 'm' if directed else 'um'
    motif_annotation = str(motif_size) + suffix
    output_name = "{}{}_{}.{}".format(data_loc, graph, motif_annotation,
                                      "motifslog")
    return write_motifs_results(output_name, motifs, zscores, n_shuffles,
                                s_model)
Esempio n. 3
0
def load_graph(args: argparse.Namespace) -> Tuple[Graph, np.ndarray]:
    """Loads the graph and the truth partition.

    Parameters
    ----------
    args : argparse.Namespace
        the command-line arguments passed to the program

    Returns
    -------
    graph : Graph
        the loaded graph
    assignment : np.ndarray[int]
        the true vertex-to-community membership array
    """
    path = build_filepath(args)
    if args.gtload:
        # Load directly through graph-tool's CSV reader.
        graph = load_graph_from_csv(path + ".tsv",
                                    not args.undirected,
                                    csv_options={'delimiter': args.delimiter})
    else:
        graph = _load_graph(path)
    print(graph)
    true_membership = load_true_membership(path, graph.num_vertices())

    if args.verbose:
        print('Number of vertices: {}'.format(graph.num_vertices()))
        print('Number of edges: {}'.format(graph.num_edges()))
    if args.degrees:
        save_degree_distribution(args, graph)
    return graph, true_membership
Esempio n. 4
0
def run(filename, header_bool, sub, obj, props, directed, output):
    """Load a TSV edge list into graph-tool and optionally save the graph.

    :param filename: path to the TSV file
    :param header_bool: passed to skip_first (skip the header row)
    :param sub: column index of the subject (source) node
    :param obj: column index of the object (target) node
    :param props: comma-separated names for the remaining edge columns
    :param directed: whether to build a directed graph
    :param output: if truthy, path to save the loaded graph to
    """
    # import modules locally
    from graph_tool import load_graph_from_csv
    from kgtk.exceptions import KGTKException

    try:
        print('loading the TSV graph now ...')
        G2 = load_graph_from_csv(filename,
                                 skip_first=header_bool,
                                 directed=directed,
                                 hashed=True,
                                 ecols=[sub, obj],
                                 eprop_names=props.split(','),
                                 csv_options={'delimiter': '\t'})

        print('graph loaded! It has %d nodes and %d edges' %
              (G2.num_vertices(), G2.num_edges()))

        if output:
            print('now saving the graph to %s' % output)
            G2.save(output)
    except Exception as e:
        # The original bare `except:` also swallowed KeyboardInterrupt and
        # discarded the cause; chain it so the real failure is visible.
        raise KGTKException from e
def datafile_to_graph(filename):
    """Read a space-delimited edge list whose two extra columns are the
    integer-valued start/end time edge properties."""
    opts = {"delimiter": " "}
    prop_names = [time_start_key, time_end_key]
    return graph_tool.load_graph_from_csv(filename,
                                          directed=True,
                                          string_vals=False,
                                          eprop_types=['int', 'int'],
                                          eprop_names=prop_names,
                                          csv_options=opts)
Esempio n. 6
0
def run(filename, directed, log_file, output):
    """Load a KGTK TSV file into graph-tool, log node/edge counts and the
    top relations to `log_file`, and optionally save the graph to `output`.
    """
    from kgtk.exceptions import KGTKException

    def infer_index(h, options=()):
        # Index of the first matching column name in header h, or -1.
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=()):
        # First matching predicate column name in header h, or ''.
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from graph_tool import load_graph_from_csv
        import kgtk.gt.analysis_utils as gtanalysis

        with open(filename, 'r') as f:
            header = next(f).split('\t')
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header, options=['property', 'predicate', 'label'])

            # Every column other than subject/object becomes an edge property.
            p = [col for i, col in enumerate(header)
                 if i not in (subj_index, obj_index)]

        with open(log_file, 'w') as writer:
            writer.write('loading the TSV graph now ...\n')
            G2 = load_graph_from_csv(filename,
                                     skip_first=True,
                                     directed=directed,
                                     hashed=True,
                                     ecols=[subj_index, obj_index],
                                     eprop_names=p,
                                     csv_options={'delimiter': '\t'})

            writer.write('graph loaded! It has %d nodes and %d edges\n' %
                         (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(
                    G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))

            if output:
                writer.write('now saving the graph to %s\n' % output)
                G2.save(output)
    except Exception as e:
        raise KGTKException('Error: ' + str(e)) from e
Esempio n. 7
0
def run(filename, output, header_bool, sub, obj, pred, props, undirected,
        strong):
    """Compute connected components of a KGTK TSV graph and write one
    (name, 'connected_component', component-id) row per vertex, either to
    `output` or to stdout. If `props` is given, only edges whose predicate
    is in the comma-separated list are kept."""
    # import modules locally
    import csv
    import sys
    from graph_tool import load_graph_from_csv
    from graph_tool.util import find_edge
    from graph_tool.topology import label_components
    from kgtk.exceptions import KGTKException

    def find_pred_position(sub, pred, obj):
        # graph-tool names non-endpoint columns c0, c1, ...; recover the
        # number it assigned to the predicate column.
        if pred < sub and pred < obj:
            return pred
        elif (pred > sub and pred < obj) or (pred < sub and pred > obj):
            return pred - 1
        else:
            return pred - 2

    try:
        header = ['node1', 'label', 'node2']
        label = 'c' + str(find_pred_position(sub, pred, obj))
        g = load_graph_from_csv(filename,
                                not undirected,
                                skip_first=not header_bool,
                                hashed=True,
                                csv_options={'delimiter': '\t'},
                                ecols=(sub, obj))
        if props:
            # Keep only the edges whose predicate is one of the given props.
            es = []
            for e in props.split(','):
                es += find_edge(g, g.edge_properties[label], e)
            g.clear_edges()
            g.add_edge_list(list(set(es)))
        comp, hist = label_components(g, directed=strong)
        if output:
            # `with` guarantees the file is closed even if a write fails
            # (the original leaked the handle on exception).
            with open(output, 'w') as f:
                wr = csv.writer(f,
                                quoting=csv.QUOTE_NONE,
                                delimiter="\t",
                                escapechar="\n",
                                quotechar='')
                wr.writerow(header)
                for v, c in enumerate(comp):
                    wr.writerow(
                        [g.vertex_properties['name'][v],
                         'connected_component', c])
        else:
            sys.stdout.write('%s\t%s\t%s\n' % ('node1', 'label', 'node2'))
            for v, c in enumerate(comp):
                sys.stdout.write('%s\t%s\t%s\n' %
                                 (g.vertex_properties['name'][v],
                                  'connected_component', str(c)))
    except Exception as e:
        # Chain the cause instead of the original bare `except:`.
        raise KGTKException from e
Esempio n. 8
0
def load_graph_from_edgelist(dataset, options=None):
    """Build a directed graph-tool graph for `dataset`.

    Prefers the pre-built graph file (path_graph_gt) unless
    options['reconstruct_graph'] is truthy, in which case the edgelist
    file is parsed instead. Returns None when neither file exists.

    :param dataset: mapping with 'path_edgelist' and 'path_graph_gt' keys
    :param options: optional dict of flags ('reconstruct_graph',
        'dict_hashed'); a fresh empty dict is used when omitted (the
        original mutable-default `options={}` was shared across calls)
    """
    options = {} if options is None else options
    edgelist, graph_gt = dataset['path_edgelist'], dataset['path_graph_gt']

    D = None

    # prefer graph_gt file
    if not options.get('reconstruct_graph') and \
            graph_gt and os.path.isfile(graph_gt):
        log.info('Constructing DiGraph from gt.xz')
        D = load_graph(graph_gt)

    elif edgelist and os.path.isfile(edgelist):
        log.info('Constructing DiGraph from edgelist')

        # NOTE(review): 'dict_hashed' selecting hashed=False looks inverted,
        # but it is preserved exactly as the original behaved — confirm
        # against callers before changing.
        hashed = not options.get('dict_hashed')
        D = load_graph_from_csv(edgelist,
                                directed=True,
                                hashed=hashed,
                                skip_first=False,
                                csv_options={
                                    'delimiter': ' ',
                                    'quotechar': '"'
                                })

        # check if graph should be dumped
        dump_graph(D, edgelist, options)
    else:
        log.error(
            'edgelist or graph_gt file to read graph from does not exist')
        return None

    return D
Esempio n. 9
0
 def load(dbsession, graph_id, graph_cache_dir):
     """Build a JackDawDomainGraphGrapthTools instance whose graph is
     loaded from the cached CSV file in `graph_cache_dir`."""
     cache_path = graph_cache_dir.joinpath(
         JackDawDomainGraphGrapthTools.graph_file_name)
     instance = JackDawDomainGraphGrapthTools(dbsession, graph_id)
     instance.graph = graph_tool.load_graph_from_csv(str(cache_path),
                                                     directed=True,
                                                     string_vals=False,
                                                     hashed=False)
     instance.setup()
     logger.debug('Graph loaded to memory')
     return instance
Esempio n. 10
0
def load_graph(tsv_fname, directed=True, skip_first=True, sep='\t'):
    """Load graph from a TSV edgelist file into graph-tool
    This will take a long time on large graphs (~2 hours for WoS 2018)

    :tsv_fname: path to edgelist file (TSV with header)
    :directed: if True, the graph has directed edges
    :skip_first: skip the first line of the TSV file (i.e., there is a header)
    :sep: delimiter for the TSV file (default: tab)
    :returns: graph_tool object

    """
    csv_opts = {'delimiter': sep}
    return graph_tool.load_graph_from_csv(tsv_fname,
                                          directed=directed,
                                          skip_first=skip_first,
                                          csv_options=csv_opts)
Esempio n. 11
0
    def process(self):
        """Read the input KGTK file, label its connected components, and
        write one (name, 'connected_component', component-id) row per
        vertex to the output file."""
        reader: KgtkReader = KgtkReader.open(
            self.input_file_path,
            error_file=self.error_file,
            who="input",
            options=self.input_reader_options,
            value_options=self.value_options,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )

        key_cols: typing.List[int] = self.get_key_columns(reader, "input")
        # graph-tool names non-endpoint columns c0, c1, ...; the label
        # column keeps its original index here.
        label = '{}{}'.format('c', key_cols[1])

        g = load_graph_from_csv(str(reader.file_path),
                                not self.undirected,
                                skip_first=not self.no_header,
                                hashed=True,
                                csv_options={'delimiter': '\t'},
                                ecols=(key_cols[0], key_cols[2]))

        header = ['node1', 'label', 'node2']
        if self.properties:
            # Keep only edges whose predicate is in the requested list.
            kept_edges = []
            for prop in self.properties.split(','):
                kept_edges += find_edge(g, g.edge_properties[label], prop)
            g.clear_edges()
            g.add_edge_list(list(set(kept_edges)))
        comp, hist = label_components(g, directed=self.strong)

        writer: KgtkWriter = KgtkWriter.open(header,
                                             self.output_file_path,
                                             mode=reader.mode,
                                             require_all_columns=False,
                                             prohibit_extra_columns=True,
                                             fill_missing_columns=True,
                                             gzip_in_parallel=False,
                                             verbose=self.verbose,
                                             very_verbose=self.very_verbose)
        for v, c in enumerate(comp):
            writer.write([g.vertex_properties['name'][v],
                          'connected_component', str(c)])
Esempio n. 12
0
def to_graph_tool(data):
    """
    convert the dataset to a graph-tool graph.

    (TBD) graph_tool support weights?

    :param data: :py:class:`gct.Dataset`
    :rtype: graph-tool graph
    """
    import graph_tool

    edge_file = data.file_edges
    # Materialize the edgelist on disk if it is not there yet.
    if not utils.file_exists(edge_file):
        data.to_edgelist()

    return graph_tool.load_graph_from_csv(edge_file,
                                          directed=data.is_directed(),
                                          string_vals=False,
                                          skip_first=False,
                                          csv_options={"delimiter": " "})
Esempio n. 13
0
def run(input_file: KGTKFiles, directed, max_hops, source_nodes, target_nodes):
    """For every source/target node pair, enumerate all paths up to
    `max_hops` edges and emit each graph edge tagged (in a 'graph' column)
    with the ids of the paths it participates in."""
    def infer_index(h, options=()):
        # Index of the first matching column name in header h, or -1.
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=()):
        # First matching predicate column name in header h, or ''.
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from kgtk.exceptions import KGTKException
        from graph_tool import load_graph_from_csv
        from graph_tool.all import find_vertex
        from graph_tool.topology import all_paths
        import sys
        from collections import defaultdict

        id_col = 'name'
        graph_edge = 'graph'

        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        filename = str(filename)
        with open(filename, 'r') as f:
            header = next(f).split('\t')
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header, options=['property', 'predicate', 'label'])

            # Every non-endpoint column becomes an edge property.
            p = [col for i, col in enumerate(header)
                 if i not in (subj_index, obj_index)]

        if 'id' not in p:
            raise KGTKException('Error: no id column found')

        G = load_graph_from_csv(filename,
                                skip_first=True,
                                directed=directed,
                                hashed=True,
                                ecols=[subj_index, obj_index],
                                eprop_names=p,
                                csv_options={'delimiter': '\t'})

        graph_id = 1
        paths = defaultdict(set)  # edge id -> set of path ids using it
        for source_node in source_nodes:
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            if len(source_ids) != 1:
                continue  # missing/ambiguous source name: skip, as before
            source_id = source_ids[0]
            for target_node in target_nodes:
                target_ids = find_vertex(G,
                                         prop=G.properties[('v', id_col)],
                                         match=target_node)
                if len(target_ids) != 1:
                    continue
                target_id = target_ids[0]
                for path in all_paths(G,
                                      source_id,
                                      target_id,
                                      cutoff=max_hops,
                                      edges=True):
                    for an_edge in path:
                        edge_id = G.properties[('e', 'id')][an_edge]
                        paths[edge_id].add(str(graph_id))
                    graph_id += 1

        sys.stdout.write('node1\tlabel\tnode2\tid\t%s\n' % graph_edge)
        for e in G.edges():
            sid, oid = e
            edge_id = G.properties[('e', 'id')][e]
            lbl = G.ep[predicate][e]
            graph_id = '|'.join(list(paths[edge_id]))
            sys.stdout.write(
                '%s\t%s\t%s\t%s\t%s\n' %
                (G.vp[id_col][sid], lbl, G.vp[id_col][oid], edge_id, graph_id))

    except Exception as e:
        raise KGTKException('Error: ' + str(e)) from e
Esempio n. 14
0
def run(filename, root, rootfile, rootfilecolumn, root_header_bool, output,
        header_bool, sub, obj, pred, props, undirected):
    """Emit a (root, 'reachable', node) row for every node reachable via
    DFS from each requested root node, optionally keeping only edges whose
    predicate is in the comma-separated `props` list. Roots come from the
    `root` argument and/or column `rootfilecolumn` of `rootfile`."""
    import sys
    import csv
    from graph_tool.search import dfs_iterator
    from graph_tool import load_graph_from_csv
    from graph_tool.util import find_edge

    # Graph-tool names columns that are not subject or object c0, c1...
    # This function finds the number that graph tool assigned to the
    # predicate column.
    def find_pred_position(sub, pred, obj):
        if pred < sub and pred < obj:
            return pred
        elif (pred > sub and pred < obj) or (pred < sub and pred > obj):
            return pred - 1
        else:
            return pred - 2

    def get_edges_by_edge_prop(g, p, v):
        return find_edge(g, prop=g.properties[('e', p)], match=v)

    label = 'c' + str(find_pred_position(sub, pred, obj))
    header = ['node1', 'label', 'node2']
    root_set = set()

    if rootfile:
        # `with` closes the file even on error (the original leaked it).
        with open(rootfile) as tsv_file:
            read_tsv = csv.reader(tsv_file, delimiter="\t")
            first_row = True
            for row in read_tsv:
                if first_row and not root_header_bool:
                    first_row = False
                    continue
                root_set.add(row[rootfilecolumn])
    if root:
        root_set.update(root.split(','))

    G = load_graph_from_csv(filename,
                            not undirected,
                            skip_first=not header_bool,
                            hashed=True,
                            csv_options={'delimiter': '\t'},
                            ecols=(sub, obj))

    name = G.vp["name"]

    index_list = [v for v in G.vertices() if name[v] in root_set]

    if props:
        # Keep only edges whose predicate matches one of the given props.
        edge_filter_set = set()
        for prop in props.split(','):
            edge_filter_set.update(get_edges_by_edge_prop(G, label, prop))
        G.clear_edges()
        G.add_edge_list(list(edge_filter_set))

    if output:
        with open(output, 'w') as f:
            tsv_writer = csv.writer(f,
                                    quoting=csv.QUOTE_NONE,
                                    delimiter="\t",
                                    escapechar="\n",
                                    quotechar='')
            if not index_list:
                print("No root nodes found in the graph")
            else:
                tsv_writer.writerow(header)
                for index in index_list:
                    for e in dfs_iterator(G, G.vertex(index)):
                        tsv_writer.writerow(
                            [name[index], 'reachable', name[e.target()]])
    else:
        if not index_list:
            print("No root nodes found in the graph")
        else:
            sys.stdout.write('%s\t%s\t%s\n' % ('node1', 'label', 'node2'))
            for index in index_list:
                for e in dfs_iterator(G, G.vertex(index)):
                    sys.stdout.write('%s\t%s\t%s\n' %
                                     (name[index], 'reachable',
                                      name[e.target()]))
import sys
import graph_tool as gt
import graph_tool.centrality as centr

def print_top_v(g, vprops):
    """Print vertex names, one per line, in descending order of their
    `vprops` score."""
    names = g.vertex_properties['name']
    ranked = sorted(g.vertices(), key=lambda v: vprops[v], reverse=True)
    for v in ranked:
        print(names[v])
			
def pageRankBiDi(g):
    """Score each vertex by the product of its PageRank on the graph and
    its PageRank on the reversed graph (restoring the original edge
    direction before returning)."""
    forward = centr.pagerank(g)

    g.set_reversed(True)
    backward = centr.pagerank(g)
    g.set_reversed(False)

    # Combine in place: forward now holds forward * backward per vertex.
    for v in g.vertices():
        forward[v] = forward[v] * backward[v]

    return forward

# Load the tab-delimited edge list named on the command line and print
# vertex names ranked by bidirectional PageRank.
g = gt.load_graph_from_csv(sys.argv[1], csv_options={'delimiter': "\t"})

if g.num_vertices() > 0:
    pr = pageRankBiDi(g)
else:
    pr = []
print_top_v(g, pr)
Esempio n. 16
0
args = parser.parse_args()
fname = args.f
dirname = '/'.join(fname.split('/')[:-1])
node_f_name = dirname + '/node_feature.csv'
edge_f_name = dirname + '/edge_feature.csv'

sep = args.d
directed = bool(args.directed)

print('Loaded file name: {},\tis_directed: {},\tis_weighted: {}\n'.format(
    fname, directed, bool(args.w)))

g = load_graph_from_csv(fname,
                        directed=directed,
                        csv_options={'delimiter': sep, 'quotechar': '"'})

weight = None
if bool(args.w):
    # Third whitespace-separated field of each input line is the weight.
    with open(fname) as f:
        edge_weights = [float(line.split()[2]) for line in f]

    # create property for edge weights
    weight = g.new_edge_property('float')
    weight.a = edge_weights
def main(args):
    """Load the edge list in args.edges, sample 1000 vertices, and write
    per-source shortest-distance results plus a calc-times CSV under
    args.outdir."""
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        logger.debug("creating output directory: {}".format(outdir))
        os.mkdir(outdir)
    else:
        logger.debug("using output directory: {}".format(outdir))

    start = timer()
    logger.debug("loading graph from {}. This will take a while...".format(
        args.edges))
    g = graph_tool.load_graph_from_csv(args.edges,
                                       directed=True,
                                       skip_first=True,
                                       csv_options={'delimiter': '\t'})
    logger.debug("done loading graph. Took {}".format(
        format_timespan(timer() - start)))

    start = timer()
    logger.debug("creating dictionary of name to vertices...")
    name_to_v = {g.vp.name[v]: v for v in g.vertices()}
    logger.debug("done loading dictionary. Took {}".format(
        format_timespan(timer() - start)))

    start = timer()
    n_samples = 1000
    random_seed = 999
    logger.debug("getting a sample of {} vertices (random seed: {})".format(
        n_samples, random_seed))
    random_state = np.random.RandomState(random_seed)
    vertices_sample_indexes = random_state.randint(low=0,
                                                   high=len(name_to_v),
                                                   size=n_samples)
    vertices_sample = [g.vertex(x) for x in vertices_sample_indexes]
    # vertices_sample is a list of graph-tools Vertex objects
    logger.debug("done getting random sample. took {}".format(
        format_timespan(timer() - start)))

    # get a unique filename
    i = 0
    while True:
        fname_calc_times = os.path.join(outdir,
                                        'calc_times_{:03}.csv'.format(i))
        if not os.path.exists(fname_calc_times):
            break
        i += 1

    # Line-buffered so each row is flushed as written; the context manager
    # closes the file even if a calculation raises (the original only
    # closed it on the success path).
    with open(fname_calc_times, 'w', buffering=1) as f_calc_times:
        sep = ','
        logger.debug("writing header to {}".format(fname_calc_times))
        f_calc_times.write(
            "source_index{sep}source_name{sep}calc_time{sep}distance_fname\n".
            format(sep=sep))

        start = timer()
        logger.debug("starting shortest path calculations...")
        if args.undirected is True:
            logger.debug(
                "treating graph as undirected for shortest distance calculations")
            directed = False
        else:
            directed = None

        for i, source in enumerate(vertices_sample):
            this_start = timer()
            source_name = g.vp.name[source]
            source_index = vertices_sample_indexes[i]
            outfname = "{:012d}.csv".format(
                i)  # filename corresponds to row number of calc_time.csv file
            outfname = os.path.join(outdir, outfname)
            if os.path.exists(outfname):
                logger.debug(
                    "filename {} already exists. skipping.".format(outfname))
                continue
            logger.debug(
                "calculating shortest distance for vertex: index: {} | name: {}".
                format(source_index, source_name))
            dist = shortest_distance(g,
                                     source=source,
                                     target=vertices_sample,
                                     directed=directed)
            this_time = timer() - this_start
            with open(outfname, 'w') as outf:
                for x in dist:
                    outf.write("{}\n".format(x))
            f_calc_times.write(
                "{source_index}{sep}{source_name}{sep}{calc_time}{sep}{distance_fname}\n"
                .format(sep=sep,
                        source_index=source_index,
                        source_name=source_name,
                        calc_time=this_time,
                        distance_fname=outfname))
        logger.debug("finished shortest path calculations. Took {}".format(
            format_timespan(timer() - start)))
Esempio n. 18
0
from datetime import datetime


from graph_tool.all import *
from graph_tool import *  



stage_dir = "/media/johannes/D45CF5375CF514C8/Users/johannes/mlhd/0-15/stage/"
stage_files = listdir(stage_dir)


for stage_file in stage_files:
    print(stage_file)
    gx = gt.load_graph_from_csv(stage_dir + stage_file,
                                directed=True,
                                string_vals=True,
                                csv_options={'delimiter': '\t'})
    # Vertices with in-degree between 100 and 10**19.
    rel_mbids = find_vertex_range(gx, 'in', (100, 10**19))

    # need to set limits
    # should be rather low: want to focus on new ones

    # genres are not equals

    # Strip the 4-character extension from the stage file name.
    mbid_file = stage_dir + "mbids/" + stage_file[0:len(stage_file) - 4] + ".csv"

    with open(mbid_file, 'w') as fo:
        wr = csv.writer(fo)
        # Plain loop instead of a side-effect list comprehension, and no
        # longer shadowing the outer loop variable.
        for v in rel_mbids:
            wr.writerow([gx.vp.name[v]])

## add saving of graph so that i don't have to read them in all the time
    
parser.add_argument("edgelistfilename", help="the edgelist to be parsed")
parser.add_argument("-p",
                    "--pngfilename",
                    type=str,
                    help="the output png name",
                    default="test.png")
args = parser.parse_args()

# Load the undirected, space-delimited edge list, skipping its header row.
g = graph_tool.load_graph_from_csv(args.edgelistfilename,
                                   directed=False,
                                   skip_first=True,
                                   csv_options={"delimiter": " "})

# Layout/drawing experiments (sfdp_layout, fruchterman_reingold_layout,
# graph_draw with the --pngfilename output) are currently disabled.
comps = graph_tool.topology.label_components(g)

# Save the second value returned by label_components (comps[1]).
numpy.savetxt("comps.txt", comps[1])
Esempio n. 20
0
File: paths.py Progetto: yyht/kgtk
def run(input_file: KGTKFiles, path_file, output_stats, directed, max_hops):
    """Print the graph's edges (unless `output_stats`) followed by, for
    each (source, target) pair listed in `path_file`, the edges of every
    path between them up to `max_hops` edges long."""
    def infer_index(h, options=()):
        # Index of the first matching column name in header h, or -1.
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=()):
        # First matching predicate column name in header h, or ''.
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from kgtk.exceptions import KGTKException
        from graph_tool import load_graph_from_csv
        from graph_tool.all import find_vertex
        from graph_tool.topology import all_paths
        import sys
        import csv
        csv.field_size_limit(sys.maxsize)
        id_col = 'name'

        # (source, target) node-name pairs, one per line after the header.
        pairs = []
        with open(path_file, 'r') as f:
            next(f)  # skip header
            for line in f:
                src, tgt = line.strip().split('\t')
                pairs.append((src, tgt))
        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        with open(filename, 'r') as f:
            header = next(f).strip().split('\t')
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header, options=['property', 'predicate', 'label'])

            # Every non-endpoint column becomes an edge property.
            p = [col for i, col in enumerate(header)
                 if i not in (subj_index, obj_index)]

        if 'id' not in p:
            raise KGTKException('Error: no id column found')
        G = load_graph_from_csv(str(filename),
                                skip_first=True,
                                directed=directed,
                                hashed=True,
                                ecols=[subj_index, obj_index],
                                eprop_names=p,
                                csv_options={'delimiter': '\t'})

        sys.stdout.write('node1\tlabel\tnode2\tid\n')
        id_count = 0
        if not output_stats:
            for e in G.edges():
                sid, oid = e
                lbl = G.ep[predicate][e]
                sys.stdout.write(
                    '%s\t%s\t%s\t%s\n' %
                    (G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                     '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)))
                id_count += 1

        id_count = 0
        path_id = 0
        for pair in pairs:
            source_node, target_node = pair
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            target_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=target_node)
            # Only unambiguous name matches are expanded into paths.
            if len(source_ids) == 1 and len(target_ids) == 1:
                source_id = source_ids[0]
                target_id = target_ids[0]
                for path in all_paths(G,
                                      source_id,
                                      target_id,
                                      cutoff=max_hops,
                                      edges=True):
                    for edge_num, an_edge in enumerate(path):
                        edge_id = G.properties[('e', 'id')][an_edge]
                        node1 = 'p%d' % path_id
                        sys.stdout.write(
                            '%s\t%d\t%s\t%s\n' %
                            (node1, edge_num, edge_id, '{}-{}-{}'.format(
                                node1, edge_num, id_count)))
                        id_count += 1
                    path_id += 1

    except Exception as e:
        raise KGTKException('Error: ' + str(e)) from e
Esempio n. 21
0
    # Persist the examined synthetic-graph properties, one CSV row per graph.
    with open('synthetic_graphs_examined.csv', mode='w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=Fieldnames)
        writer.writeheader()
        for prop in props:
            writer.writerow(prop._asdict())
    # Examine real world graphs
    print("=====Examining Real World Graphs=====")
    # Write only the header now; rows are appended after each graph in the
    # loop below, so earlier results are on disk before later graphs run.
    with open('real_graphs_examined.csv', mode='w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=Fieldnames)
        writer.writeheader()
    for graphname in REAL:
        print("graphname = ", graphname)
        filename = "../../data/{0}/unkOverlap_unkBlockSizeVar/{0}_unkOverlap_unkBlockSizeVar_-1_nodes".format(
            graphname)
        # Graphs listed in REAL_UNDIRECTED are loaded undirected; all
        # others are loaded as directed graphs.
        if graphname in REAL_UNDIRECTED:
            graph = load_graph_from_csv(filename + ".tsv",
                                        False,
                                        csv_options={'delimiter': ' '})
            directed = False
        else:
            graph = load_graph_from_csv(filename + ".tsv",
                                        True,
                                        csv_options={'delimiter': ' '})
            directed = True
        print("done loading graph")
        prop = examine_graph(graph, "real", graphname, True, directed)
        print("done examining graph")
        # Append this graph's row immediately after examining it.
        with open('real_graphs_examined.csv', mode='a') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=Fieldnames)
            writer.writerow(prop._asdict())
Esempio n. 22
0


# use awk first
# awk '$1 >= 1239200000 && $1 <= 1259400000' *.txt >> read_in.txt

# requires filename as output
# awk '$1 > 5 && $1 < 20' *.txt

# Time the external sort step.
t3 = time.time()

# Run the shell-side sorter in the data directory.
proc_str = 'cd ' + daet_dir + ' && ./sorter.sh'

os.system(proc_str)

# Load the sorted, tab-delimited output as a directed graph with string
# vertex names.
g2 = gt.load_graph_from_csv('/media/johannes/D45CF5375CF514C8/Users/johannes/mlhd/0-15/01/read_in.xxx', directed=True, string_vals=True, csv_options={'delimiter':'\t'})

t4=time.time()


# Vertices whose 'in' value falls in [500, 1000000].
subs=find_vertex_range(g2, 'in', (500, 1000000))
# NOTE(review): this expression statement discards its result — looks like
# leftover interactive exploration; confirm before removing.
g2.vp.name[subs]

# Print the name of each selected vertex.
[print(g2.vp.name[i]) for i in subs]

##################################
# add multiple dirs

rel_dirs = ['00','01', '02', '03', '04', '05']
# Timestamp window bounds (Unix epoch seconds) for later filtering.
t1 = 1250000000
t2 = 1260000000
Esempio n. 23
0
def run(input_file: KGTKFiles, directed, compute_degrees, compute_pagerank, compute_hits, log_file, output_stats,
        vertex_in_degree, vertex_out_degree, vertex_pagerank, vertex_auth, vertex_hubs):
    """Load a KGTK TSV edge file into graph-tool, compute the requested
    statistics, write a human-readable report to `log_file`, and emit KGTK
    edges on stdout.

    Parameters
    ----------
    input_file : KGTKFiles
        Input spec accepted by KGTKArgumentParser.get_input_file.
    directed : bool
        Load the graph as directed.
    compute_degrees, compute_pagerank, compute_hits : bool
        Toggles for the individual statistics sections in the report.
    log_file : str
        Path of the text report that is (over)written.
    output_stats : bool
        When true, the original edges are suppressed on stdout and only the
        per-vertex statistics edges are emitted.
    vertex_in_degree, vertex_out_degree, vertex_pagerank, vertex_auth, vertex_hubs : str
        Label strings used for the emitted per-vertex statistics edges.

    Raises
    ------
    KGTKException
        Wraps any error raised while loading or processing the graph.
    """
    from kgtk.exceptions import KGTKException

    def infer_index(h, options=()):
        """Return the index in header `h` of the first name in `options`, or -1."""
        # Tuple default instead of a mutable list default.
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=()):
        """Return the first name in `options` present in header `h`, or ''."""
        for o in options:
            if o in h:
                return o
        return ''

    # Maps the vertex-property keys stored on the graph to the label text
    # used when those property values are emitted as KGTK edges.
    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:
        # import modules locally
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        import kgtk.gt.analysis_utils as gtanalysis
        from pathlib import Path
        import sys
        import csv
        csv.field_size_limit(sys.maxsize)

        # NOTE(review): KGTKArgumentParser is not imported inside this
        # function — it must be supplied by the enclosing module's imports;
        # verify against the module header.
        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
        directions = ['in', 'out', 'total']
        id_col = 'name'

        # Inspect the TSV header: locate subject/object columns and the
        # predicate (label) column; every remaining column becomes an edge
        # property when the graph is loaded.
        with open(filename, 'r') as f:
            header = next(f).split('\t')
            header = [h.strip() for h in header]
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header, options=['node2', 'object', 'value'])
            predicate = infer_predicate(header, options=['label', 'predicate', 'relation', 'relationship'])
            p = []
            for i, header_col in enumerate(header):
                if i in [subj_index, obj_index]: continue
                p.append(header_col)
        with open(log_file, 'w') as writer:

            writer.write('loading the TSV graph now ...\n')
            G2 = load_graph_from_csv(str(filename),
                                     skip_first=True,
                                     directed=directed,
                                     hashed=True,
                                     ecols=[subj_index, obj_index],
                                     eprop_names=p,
                                     csv_options={'delimiter': '\t'})

            writer.write('graph loaded! It has %d nodes and %d edges\n' % (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))

            if compute_degrees:
                writer.write('\n###Degrees:\n')
                for direction in directions:
                    degree_data = gtanalysis.compute_node_degree_hist(G2, direction)
                    # The histogram's last bucket index is the maximum degree.
                    max_degree = len(degree_data) - 1
                    mean_degree, std_degree = gtanalysis.compute_avg_node_degree(G2, direction)
                    writer.write(
                        '%s degree stats: mean=%f, std=%f, max=%d\n' % (direction, mean_degree, std_degree, max_degree))

            if compute_pagerank:
                writer.write('\n###PageRank\n')
                # Store PageRank as an internal vertex property so it is
                # picked up by the per-vertex emission loop below.
                v_pr = G2.new_vertex_property('float')
                centrality.pagerank(G2, prop=v_pr)
                G2.properties[('v', 'vertex_pagerank')] = v_pr
                writer.write('Max pageranks\n')
                result = gtanalysis.get_topn_indices(G2, 'vertex_pagerank', 5, id_col)
                for n_id, n_label, pr in result:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

            if compute_hits:
                writer.write('\n###HITS\n')
                hits_eig, G2.vp['vertex_hubs'], G2.vp['vertex_auth'] = gtanalysis.compute_hits(G2)
                writer.write('HITS hubs\n')
                main_hubs = gtanalysis.get_topn_indices(G2, 'vertex_hubs', 5, id_col)
                for n_id, n_label, hubness in main_hubs:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, hubness))
                writer.write('HITS auth\n')
                main_auth = gtanalysis.get_topn_indices(G2, 'vertex_auth', 5, id_col)
                for n_id, n_label, authority in main_auth:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority))

            # Emit the KGTK header, then (unless suppressed) the original edges.
            sys.stdout.write('node1\tlabel\tnode2\tid\n')
            id_count = 0
            if not output_stats:
                for e in G2.edges():
                    sid, oid = e
                    lbl = G2.ep[predicate][e]
                    sys.stdout.write(
                        '%s\t%s\t%s\t%s\n' % (G2.vp[id_col][sid], lbl, G2.vp[id_col][oid],
                                              '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count)))
                    id_count += 1

            # Emit one statistics edge per vertex for degrees and for every
            # computed vertex property (except the id column itself).
            id_count = 0
            for v in G2.vertices():
                v_id = G2.vp[id_col][v]

                sys.stdout.write(
                    '{}\t{}\t{}\t{}\n'.format(v_id, vertex_in_degree, v.in_degree(),
                                              '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)))
                id_count += 1
                sys.stdout.write(
                    '{}\t{}\t{}\t{}\n'.format(v_id, vertex_out_degree, v.out_degree(),
                                              '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)))
                id_count += 1

                for vprop in G2.vertex_properties.keys():
                    if vprop == id_col: continue
                    sys.stdout.write(
                        '%s\t%s\t%s\t%s\n' % (v_id, v_prop_dict[vprop], G2.vp[vprop][v],
                                              '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)))
                    id_count += 1

    except Exception as e:
        # Chain with `from e` so the original traceback is preserved for
        # debugging instead of being swallowed by the wrapper exception.
        raise KGTKException('Error: ' + str(e)) from e
Esempio n. 24
0
import graph_tool
from kgtk.gt import analysis_utils, topology_utils

# Alternative, larger input kept for reference:
#input_file="P279_sorted_by_node.csv"
input_file = "first100k_P279.csv"
direction = 'total'

# Load the comma-separated P279 edge list as a directed graph; columns 0 and
# 2 are the edge endpoints, and the header row is skipped.
csv_opts = {'delimiter': ',', 'quotechar': '"'}
base_graph = graph_tool.load_graph_from_csv(
    input_file,
    directed=True,
    skip_first=True,
    ecols=(0, 2),
    csv_options=csv_opts,
)

# Basic statistics of the loaded graph.
print(analysis_utils.get_num_nodes(base_graph))

print(analysis_utils.get_num_edges(base_graph))

print(analysis_utils.compute_stats(base_graph, direction))
print('now computing transitive closure')
closure_graph = topology_utils.compute_transitive_closure(base_graph)
print('transitive closure computed')

# Same statistics for the transitive closure, for comparison.
print(analysis_utils.get_num_nodes(closure_graph))

print(analysis_utils.get_num_edges(closure_graph))

print(analysis_utils.compute_stats(closure_graph, direction))