Example #1
def work(num=None):
    if num == None:
        g = gt.load_graph("../data/graphAll.xml.gz")
    else:
        g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")

    res = gt.stats.distance_histogram(g)  # May take a long time.

    fig = plt.figure()
    plt.plot(res[0], label="Distance distribution")
    plt.legend(loc="upper right")
    plt.xlabel("Distance")
    plt.ylabel("Count")
    if num == None:
        fig.savefig("../pic/distance.png")
    else:
        fig.savefig("../pic/distance" + str(num) + ".png")

    fig = plt.figure()
    res[0][0] = 1
    plt.plot(np.log10(res[0]), label="Log-distance distribution")
    plt.legend(loc="upper right")
    plt.xlabel("Distance")
    plt.ylabel("Log-count")

    if num == None:
        fig.savefig("../pic/log-distance.png")
    else:
        fig.savefig("../pic/log-distance" + str(num) + ".png")

    max_distance = max(res[1]) - 1
    avg_distance = np.sum(res[0] * res[1][:-1]) / np.sum(res[0])
    print('max_distance: ' + str(max_distance))
    print('avg_distance: ' + str(avg_distance))
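A minimal driver sketch for the function above, assuming the module's own imports for gt, plt and np, and that the relative ../data and ../pic folders exist; the file names are those hard-coded in the function.

# Hypothetical calls; output PNGs land in ../pic/
work()        # reads ../data/graphAll.xml.gz, writes distance.png and log-distance.png
work(3)       # reads ../data/graph3.xml.gz, writes distance3.png and log-distance3.png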
Example #2
def prepare_input_graph(graphName, metric, verbose=True):
    storedFolder = roles.graph_folder(graphName)
    try:
        inGraph = IO.load_data("../Data/Graphs/" + storedFolder + "/" +
                               graphName + ".GT.graph").next()
    except:
        inGraph = gt.load_graph("../Data/Graphs/" + storedFolder + "/" +
                                graphName + ".graph.xml")

    if verbose:
        print "Loaded Input Graph.\nName = %s. \nMetric = %s. \n#Nodes = %d. #Edges = %d." % (
            graphName, metric, inGraph.num_vertices(), inGraph.num_edges())
    groupTaxa, blackList = roles.graph_node_clusters(graphName, inGraph,
                                                     metric)

    xTickMarks = roles.taxa_names(graphName)

    if verbose:
        if blackList is not None:
            print("True Number of Clusters = " +
                  str(len(set(groupTaxa)) - len(blackList)) + "\n")
        else:
            print("True Number of Clusters = " + str(len(set(groupTaxa))) + "\n")

    return inGraph, groupTaxa, blackList, xTickMarks
Example #3
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")

    vp, ep = gt.centrality.betweenness(g)

    fig = plt.figure()
    res = plt.hist(vp.a, label="Betweenness of vertices", bins=100)
    plt.xlim(0, 0.04)
    plt.ylim(0, 250)
    plt.legend(loc="upper right")
    plt.xlabel("Betweenness")
    plt.ylabel("Count")
    fig.savefig('../pic/betweenness_vertices' + str(num) + '.png')
    print('max of betweenness_vertices: ' + str(np.max(res[1])))
    print('mean of betweenness_vertices: ' + str(np.mean(res[1])))
    print('var of betweenness_vertices: ' + str(np.var(res[1])))

    fig = plt.figure()
    res = plt.hist(ep.a, label="Betweenness of edges", bins=100)
    plt.xlim(0, 0.015)
    plt.ylim(0, 300)
    plt.legend(loc="upper right")
    plt.xlabel("Betweenness")
    plt.ylabel("Count")
    fig.savefig('../pic/betweenness_edges' + str(num) + '.png')
    print('max of betweenness_edges: ' + str(np.max(res[1])))
    print('mean of betweenness_edges: ' + str(np.mean(res[1])))
    print('var of betweenness_edges: ' + str(np.var(res[1])))
Example #4
def metrics(file, use_cache=True):
    # use cache or recompute
    cache = os.path.splitext(file)[0] + ".json"
    if use_cache and os.path.isfile(cache):
        print('using cached metrics for', os.path.basename(file))
        with open(cache, "r") as fp:
            return json.load(fp)
    print('computing metrics for', os.path.basename(file))

    # read file
    g = load_graph(file)
    degrees = list(g.degree_property_map("out"))
    with open(file) as f:
        metalines = [next(f) for x in range(13)]

    # gather data
    metrics = {}
    metrics['file'] = os.path.basename(file)
    metrics['edges'] = int(metalines[5].split()[-1])
    metrics['rounds'] = int(metalines[1].split()[-1])
    metrics['max_degree'] = max(degrees)
    metrics['avg_degree'] = mean(degrees)
    metrics['min_degree'] = min(degrees)
    metrics['local_clustering'] = mean(local_clustering(g).get_array())
    metrics['global_clustering'] = global_clustering(g)[0]
    metrics['pseudo_diameter'] = int(pseudo_diameter(g)[0])
    fit = powerlaw.Fit(degrees, discrete=True, verbose=False)
    metrics['exponent'] = fit.alpha
    metrics['KS'] = fit.power_law.KS()
    metrics['x_min'] = fit.xmin

    with open(cache, "w") as fp:
        json.dump(metrics, fp)

    return metrics
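A brief usage sketch of the caching behaviour above; 'run_00.graphml' is a placeholder, and the file must be in whatever format the surrounding pipeline produces (readable by load_graph and carrying the 13 metadata header lines the function reads).

# Hypothetical usage; the cache file name is derived from the input (here run_00.json).
m = metrics("run_00.graphml")                    # first call computes metrics and writes the JSON cache
m = metrics("run_00.graphml")                    # second call returns the cached dict
print(m["max_degree"], m["exponent"])
m = metrics("run_00.graphml", use_cache=False)   # force recomputation (still refreshes the cache)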
Example #5
 def load_graph(self, value):
     stream = StringIO(value)
     try:
         g = load_graph(stream, fmt="graphml")
     except OSError as err:
         raise ValidationError("data is not correctly formatted as graphml (xml)")
     return g
Example #6
def extract_distances(fold, base_filename, name, exclude_borders=1):
    """
    Extracts distances information from a .gt file into a .csv file. By default,
    values within 1 (in units of the graph) to surface borders are excluded.

    Args:
        fold (str): path where the input is and where the output will be written
        base_filename (str): base file name for input and output files
        name (str): name of the property to extract (e.g., 'PMdistance' or
            'cERthickness')
        exclude_borders (int, optional): if > 0, triangles within this distance
            from borders and corresponding values will be excluded from the
            output files (graph .gt, surface.vtp file and .csv)

    Returns:
        None
    """
    # input graph and surface files
    gt_infile = '{}{}.gt'.format(fold, base_filename)
    # output csv, gt and vtp files
    csv_outfile = '{}{}.csv'.format(fold, base_filename)
    gt_outfile = None
    vtp_outfile = None
    if exclude_borders > 0:
        eb = "_excluding{}borders".format(exclude_borders)
        gt_outfile = '{}{}{}.gt'.format(fold, base_filename, eb)
        csv_outfile = '{}{}{}.csv'.format(fold, base_filename, eb)
        vtp_outfile = '{}{}{}.vtp'.format(fold, base_filename, eb)

    # Create TriangleGraph object and load the graph file
    tg = TriangleGraph()
    tg.graph = load_graph(gt_infile)

    _extract_distances_from_graph(tg, csv_outfile, exclude_borders, name,
                                  gt_outfile, vtp_outfile)
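A hedged call sketch following the docstring above; the folder and base file name are placeholders, and the input '<fold><base_filename>.gt' graph is assumed to exist.

# Hypothetical call; outputs (.csv, .gt, .vtp with an "_excluding1borders" suffix) land in the same folder.
extract_distances(fold="/path/to/run/",       # used as a plain prefix, so keep the trailing slash
                  base_filename="membrane",   # expects /path/to/run/membrane.gt as input
                  name="PMdistance",          # property name, as in the docstring example
                  exclude_borders=1)          # exclude triangles within 1 graph unit of borders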
Example #7
def findCommunities(filename):
    trials = 1
    fullFile = f'{filename}.graphml'
    print(fullFile)
    graph = gt.load_graph(fullFile)
    lowest_entropy = np.inf
    best_community = None
    for i in range(trials):
        state = inference.minimize_blockmodel_dl(graph,
                                                 deg_corr=True,
                                                 verbose=True)
        b = state.get_blocks()
        print(state.entropy())
        if state.entropy() < lowest_entropy:
            best_community = b
            lowest_entropy = state.entropy()
    communityMapping = dict()
    nodeList = list()
    communityID = list()
    for v in graph.vertices():
        nodeList.append(str(graph.vertex_properties["_graphml_vertex_id"][v]))
        communityID.append(str(best_community[v]))
    communityMapping['NODE_ID'] = nodeList
    communityMapping['COMMUNITY_ID'] = communityID
    return communityMapping
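A short usage sketch, assuming a file named 'mygraph.graphml' exists next to the script; pandas is used here only to illustrate how the returned dict of parallel lists might be consumed.

# Hypothetical usage; 'mygraph' is a placeholder base name (the '.graphml' suffix is added inside).
import pandas as pd

mapping = findCommunities("mygraph")
df = pd.DataFrame(mapping)                 # columns: NODE_ID, COMMUNITY_ID
print(df.groupby("COMMUNITY_ID").size())   # community sizes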
Example #8
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")
    state = gt.inference.minimize_blockmodel_dl(g)
    deg = g.degree_property_map("in")
    deg.a = 4 * (np.sqrt(deg.a) * 0.5 + 0.4)
    ebet = gt.centrality.betweenness(g)[1]
    ebet.a /= ebet.a.max() / 10.
    eorder = ebet.copy()
    eorder.a *= -1
    state.draw(vertex_shape=state.get_blocks(),
               output="../pic/blockmodel" + str(num) + ".pdf",
               vertex_size=deg,
               vertex_fill_color=deg,
               vorder=deg,
               edge_color=ebet,
               eorder=eorder,
               edge_pen_width=ebet)
    state.draw(vertex_shape=state.get_blocks(),
               output="../pic/blockmodel" + str(num) + ".png",
               vertex_size=deg,
               vertex_fill_color=deg,
               vorder=deg,
               edge_color=ebet,
               eorder=eorder,
               edge_pen_width=ebet)
Example #9
 def _read_graph(self):
     from graph_tool import load_graph, Graph
     logger.info(f'import graphml file from {self.config.graph_path}')
     self.graph: Graph = load_graph(self.config.graph_path)
     if not self.config.directed:
         logger.info('Converting to undirected graph')
         self.graph.set_directed(False)
Example #10
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")

    pos = gt.draw.sfdp_layout(g)
    gt.draw.graph_draw(g,
                       pos=pos,
                       output="../pic/sfdp_layout" + str(num) + ".pdf")
    gt.draw.graph_draw(g,
                       pos=pos,
                       output="../pic/sfdp_layout" + str(num) + ".png")

    pos = gt.draw.arf_layout(g, max_iter=0)
    gt.draw.graph_draw(g,
                       pos=pos,
                       output="../pic/arf_layout" + str(num) + ".pdf")
    gt.draw.graph_draw(g,
                       pos=pos,
                       output="../pic/arf_layout" + str(num) + ".png")

    pos = gt.draw.radial_tree_layout(g, g.vertex(0))
    gt.draw.graph_draw(g,
                       pos=pos,
                       output="../pic/radial_tree_layout" + str(num) + ".pdf")
    gt.draw.graph_draw(g,
                       pos=pos,
                       output="../pic/radial_tree_layout" + str(num) + ".png")
Example #11
    def load(cls, path=config.BUS_ROAD_GRAPH_PATH):
        """ Loads a graphtool graph
        """
        gtG = load_graph(path)

        inst = cls()
        inst.gtG = gtG
        return inst
Example #12
def load_data():
    import graph_tool as gt

    graphs = {}
    LOG.info("Loading graph data from %s", DIR)
    for fname in DIR.glob("*.gt"):
        graphs[fname.stem] = gt.load_graph(str(fname))
    return graphs
Example #13
def get_graph(arg_n=1):
    file = sys.argv[arg_n] if len(sys.argv) > arg_n else None
    if file is None:
        return False
    elif file.split('.')[-1] == 'txt':
        directed = sys.argv[arg_n + 1].lower() == "d"
        return load_graph_from_raw(file, directed)
    else:
        return GT.load_graph(file)
Example #14
def country_network(name, year, check_fields={}, n_neigh=6):
    graph_file_name = name + '_' + str(year) + '_k' + str(n_neigh) + '.xml'
    if os.path.exists(graph_file_name):
        return gt.load_graph(graph_file_name)

    if name == 'US' and check_fields == {}:
        states = ['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
                  'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
                  'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ',
                  'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD',
                  'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']
        check_fields = {'STATE': states}

    station_codes = []
    pos = []
    with open(os.path.join(folder_name, 'isd-history.csv'), 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            functional = int(row['BEGIN'][:4]) <= year <= int(row['END'][:4])
            lon = row['LON']
            lat = row['LAT']
            if row['CTRY'] == name and functional and lon and lat:
                if not all([row[field] in check_fields[field]
                            for field in check_fields]):
                    continue
                station_codes.append((row['USAF'], row['WBAN']))
                pos.append([float(lon), -float(lat)])

    station_values, missing = read_stations(station_codes, year)
    pos = np.delete(np.array(pos), missing, axis=0)

    weights = squareform(pdist(pos))
    weights = np.exp(-weights / np.median(weights))
    idx_sorted = np.argsort(weights)
    idx_sorted = idx_sorted[:, -n_neigh-1:-1]
    print(idx_sorted.shape)

    graph = gt.Graph(directed=False)
    graph.add_vertex(n=pos.shape[0])
    e_weights = graph.new_edge_property('double', vals=1)
    for i in range(weights.shape[0]):
        w_i = weights[i, idx_sorted[i, :]]
        for j in range(n_neigh):
            e = graph.edge(i, idx_sorted[i, j], new=True)
            e_weights[e] = w_i[j]

    v_pos = graph.new_vertex_property('vector<double>', vals=pos)
    v_values = graph.new_vertex_property('double', vals=station_values)

    graph.vertex_properties['pos'] = v_pos
    graph.vertex_properties['station_values'] = v_values
    graph.edge_properties['weights'] = e_weights

    graph.save(graph_file_name)

    return graph
Example #15
    def test_load_graph_from_file(self, here):
        """Should load the graph from the test.graphml file in tests/data/test.graphml"""
        filepath = os.path.abspath(os.path.join(here, 'data', 'test.graphml'))
        fmt = filepath.split('.')[-1].strip()
        with open(filepath, 'rb') as f:
            g = gt.load_graph(f, fmt=fmt)

            # make sure there are vertices
            vertices = list(g.vertices())
            assert len(vertices) > 200
Example #16
    def from_expe(cls, expe, corpus=None, load=True, save=True):
        if '_force_load_data' in expe:
            load = expe._force_load_data
        if '_force_save_data' in expe:
            save = expe._force_save_data

        input_path = cls.get_input_path(expe)

        data = None
        fn = cls._resolve_filename(expe)
        target_file_exists = os.path.exists(fn)

        if load is False or not target_file_exists:

            # Data loading Strategy
            if not data:
                try:
                    # Load from graph-tool Konnect repo
                    from graph_tool import collection
                    data = gt.load_graph(collection.get_data_path(expe.corpus))
                    os.makedirs(os.path.join(input_path), exist_ok=True)
                except FileNotFoundError as e:
                    pass
                except Exception as e:
                    cls.log.error("Error in loading corpus `%s': %s" %
                                  (expe.corpus, e))
                    raise e

            if not data:
                try:
                    from urllib.error import HTTPError
                    from tarfile import ReadError
                    # Load from graph-tool Konnect site
                    data = gt.collection.konect_data[expe.corpus]
                    data = cls._clean_data_konect(expe, data)
                    os.makedirs(os.path.join(input_path), exist_ok=True)
                except (HTTPError, OSError, ReadError) as e:
                    pass
                except Exception as e:
                    cls.log.error("Error in loading corpus `%s': %s" %
                                  (expe.corpus, e))
                    raise e

            if not data:
                # Load manually from file
                data = cls._extract_data_file(expe, corpus=corpus)

            if save:
                # ===== save ====
                cls._save_data(fn, data)
        else:
            # ===== load ====
            data = cls._load_data(fn)

        return cls(expe, data, corpus=corpus)
Example #17
def draw_community(gml_fn,
                   output,
                   layout_name=None,
                   layout_kwargs=dict(),
                   **draw_kwargs):
    g = load_graph(gml_fn)

    # Sample of graph g
    # g = GraphView(g, vfilt=lambda v: g.vertex_index[v]%2==0)
    g.vp['wdeg'] = g.degree_property_map('total', weight=g.ep['weight'])
    # g = GraphView(g, vfilt=lambda v: g.vp['wdeg'][v]>0)

    # label for hub account only in each community
    g.vp['clabel'] = g.new_vertex_property("string", val="")
    for c in np.nditer(np.unique(g.vp['community'].a)):
        cg = GraphView(g, vfilt=(g.vp['community'].a == c))
        v_hub = find_vertex(cg, cg.vp['wdeg'], cg.vp['wdeg'].fa.max())[0]
        cg.vp['clabel'][v_hub] = cg.vp['screenname'][v_hub]

    v_size = prop_to_size(
        g.vp['wdeg'],
        mi=MIN_V_SIZE,
        ma=MAX_V_SIZE,
        log=V_SIZE_LOG,
        power=V_SIZE_POWER)
    e_width = prop_to_size(
        g.ep['weight'],
        mi=MIN_E_WIDTH,
        ma=MAX_E_WIDTH,
        log=E_WIDTH_LOG,
        power=E_WIDTH_POWER)
    if layout_name is not None:
        try:
            pos = globals()[layout_name](g, **layout_kwargs)
        except KeyError as e:
            logger.critical('No such layout function found!')
            raise
    graph_draw(
        g,
        pos,
        output=output,
        vprops=dict(
            fill_color=g.vp['community'],
            # color='grey',
            size=v_size,
            pen_width=0.01,
            text=g.vp['clabel'],
            text_position='centered',
            font_size=8,),
        eprops=dict(
            pen_width=e_width,
            end_marker="arrow",),
        **draw_kwargs)
Example #18
 def __init__(self, infile=None, fmt='dot', outfile=None):
     if infile is not None:
         super().__init__(graph_tool.load_graph(infile, fmt))
         self.root = self.vertex(0)
     else:
         super().__init__()
         self.root = self.add_vertex()
         self.ep['label'] = self.new_edge_property('string')
     if outfile is None:
         self.outfile = sys.stdout
     else:
         self.outfile = open(outfile, 'w')
Example #19
 def load_graph(seed=None):
     if BASENAME.startswith('soc'):
         rw.read_original_graph(BASENAME, seed=seed, balanced=BALANCED)
         redensify.G = deepcopy(rw.G)
         redensify.EDGES_SIGN = deepcopy(rw.EDGE_SIGN)
     elif DATA == 'LP':
         _ = persistent.load_var(BASENAME+'.my')
         redensify.G, redensify.EDGES_SIGN = _
         return
     else:
         G = gt.load_graph(BASENAME+'.gt')
         cexp.to_python_graph(G)
Example #20
def work(num=None):
    if num == None:
        g = gt.load_graph("../data/graphAll.xml.gz")
    else:
        g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")

    pr = gt.centrality.pagerank(g)

    fig = plt.figure()
    plt.hist(np.log10(pr.a), label="PageRank", bins=100)
    plt.legend(loc="upper right")
    plt.xlabel("PageRank")
    plt.ylabel("Count")
    if num == None:
        fig.savefig('../pic/pagerank.png')
    else:
        fig.savefig('../pic/pagerank' + str(num) + '.png')

    print('max of pagerank: ' + str(np.max(pr.a.tolist())))
    print('min of pagerank: ' + str(np.min(pr.a.tolist())))
    print('mean of pagerank: ' + str(np.mean(pr.a.tolist())))
    print('variance of pagerank: ' + str(np.var(pr.a.tolist())))
Example #21
def add_metrics(graph_path, metrics):
    def get_metric(ggt, metric, n_nodes, n_edges):
        if "d" == metric:
            # Density
            if n_nodes <= 1:
                value = 0.0
            else:
                value = ( 2.0 * n_edges ) / ( n_nodes * (n_nodes - 1.0) )
            ggt.gp[metric] = ggt.new_gp("float", val=value)
        elif "dg" == metric:
            # Degree
            if n_nodes <= 1:
                value = np.zeros(n_nodes, dtype=np.float32)
            else:
                value = ggt.degree_property_map('total').get_array()
            ggt.vp[metric] = ggt.new_vp("double", vals=value)
        elif "dgc" == metric:
            # Degree centrality
            if n_nodes <= 1:
                value = np.zeros(n_nodes, dtype=np.float32)
            else:
                value = ggt.degree_property_map('total').get_array() / (n_nodes - 1.0)
            ggt.vp[metric] = ggt.new_vp("double", vals=value)
        elif "cnw" == metric:
            # Clustering coefficient ( non-weighted )
            value = local_clustering(ggt).get_array()
            ggt.vp[metric] = ggt.new_vp("double", vals=value)
        elif "cw" == metric:
            # Clustering coefficient ( weighted )
            value = local_clustering(ggt, weight=ggt.ep.weight).get_array()
            ggt.vp[metric] = ggt.new_vp("double", vals=value)
        elif "pgr" == metric:
            # Page Rank
            value = pagerank(ggt).get_array()
            ggt.vp[metric] = ggt.new_vp("double", vals=value)

    ggt = gt.load_graph(str(graph_path))
    time = int(graph_path.stem.split(".")[0])
    ggt.gp.time = ggt.new_gp("int32_t", val=time)

    save_path = graph_path.parent.joinpath("../graphs_with_metrics")
    if not os.path.isdir(save_path):
        os.makedirs(save_path)

    num_edges = ggt.num_edges()
    num_nodes = ggt.num_vertices()
    for m in metrics:
        get_metric(ggt, m, num_nodes, num_edges)

    ggt.save(str(save_path.joinpath("{}.gt.xz".format(time))))
Example #22
def prepare_conceptnet(
        graph_path: Union[str, Path]) -> Tuple[Graph, Dict[str, gt.Vertex]]:
    logger.info(f"Load conceptnet graph - {str(graph_path)}")
    conceptnet_graph = gt.load_graph(str(graph_path))
    logger.info(f"Loaded conceptnet graph - {str(graph_path)}")
    remove_self_loops(conceptnet_graph)
    conceptnet_graph.reindex_edges()
    logger.info(f"Generate aspect name to vertex mapping  - {str(graph_path)}")
    vertices_conceptnet = dict(
        zip(
            conceptnet_graph.vertex_properties["aspect_name"],
            conceptnet_graph.vertices(),
        ))
    return Graph(conceptnet_graph), vertices_conceptnet
Example #23
def load_graph_by_name(name, weighted=False, suffix=''):
    suffix = suffix.strip()
    if name == 'lattice':
        shape = (10, 10)
        g = lattice(shape)
    else:
        if weighted:
            path = 'data/{}/graph_weighted{}.gt'.format(name, suffix)
        else:
            path = 'data/{}/graph{}.gt'.format(name, suffix)
        print('load graph from {}'.format(path))
        g = load_graph(path)
    # assert not g.is_directed()
    return remove_filters(g)  # add shell
Example #24
def load_wiki():
    import graph_tool as gt
    import real_world as rw
    graph_file = 'wiki_simple.gt'
    ds_file = 'wiki_dst.npy'
    k = gt.load_graph(graph_file)
    dst_mat = np.load(ds_file)
    lcc = label_largest_component(k)
    k.set_vertex_filter(lcc)
    lcc_nodes = np.where(lcc.a)[0]
    rw.read_original_graph('soc-wiki.txt')
    cexp.redensify.G = rw.G
    cexp.redensify.N = len(rw.G)
    cexp.redensify.EDGES_SIGN = rw.EDGE_SIGN
    return k, lcc_nodes, dst_mat
Example #25
    def test_load_graph_from_stream(self, here):
        """Should load the graph from the test.graphml from bytes"""

        # get bytes from file
        filepath = os.path.join(here, 'data', 'test.graphml')
        fmt = filepath.split('.')[-1].strip()
        with open(filepath, 'rb') as f:
            graph_txt = f.read()

            # create stream from bytes and load graph
            stream = io.BytesIO(graph_txt)
            g = gt.load_graph(stream, fmt=fmt)

            # make sure there are vertices
            vertices = list(g.vertices())
            assert len(vertices) > 200
Example #26
def create_jsons_from_graphs(path, metrics):
    load_path = path
    save_path = path.parent.joinpath("metric_jsons")
    if not os.path.isdir(save_path):
        os.makedirs(save_path)

    graph_paths = sorted(load_path.glob("*.gt.xz"),
                         key=lambda x: int(x.stem.split(".")[0]))
    print("Process graphs to create JSONs")
    for i in tqdm(range(len(graph_paths))):
        p = graph_paths[i]
        G = gt.load_graph(str(p))
        labels = G.vp.label.get_2d_array([0]).tolist()[0]
        get_metrics(G, G.num_vertices(), G.num_edges(),
                    int(p.stem.split(".")[0]), str(save_path), metrics, labels)
    return sorted(save_path.glob("*.json"), key=lambda x: int(x.stem))
Example #27
def prepare_conceptnet_graph(graph_path: str, relation_types: Set[str]):
    g = gt.load_graph(graph_path)
    remove_self_loops(g)
    g.reindex_edges()

    # filter relations
    e_hierarchical_relation_filter = g.new_edge_property("bool")
    relations = list(g.properties[("e", "relation")])
    for edge, edge_relation in tqdm(zip(g.edges(), relations),
                                    desc="Edge filtering...",
                                    total=len(relations)):
        e_hierarchical_relation_filter[edge] = edge_relation in relation_types
    g.set_edge_filter(e_hierarchical_relation_filter)

    vertices = dict(zip(g.vertex_properties["aspect_name"], g.vertices()))

    return g, vertices
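A short usage sketch of the function above, assuming a graph-tool file at a placeholder path whose edges carry a 'relation' property and whose vertices carry 'aspect_name'; the relation names here are only illustrative.

# Hypothetical path and relation names; adjust to however the .gt file was built.
g, vertices = prepare_conceptnet_graph("conceptnet.en.gt", {"IsA", "PartOf"})
print(g.num_vertices(), g.num_edges())   # edge count reflects the relation filter
v = vertices.get("dog")                  # look up a vertex by its aspect_name, if present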
Example #28
    def load(self):
        if self._mapfile[-3:] != 'shp':
            self.g = load_graph(self._mapfile)
            return

        try:
            sf = shapefile.Reader(self._mapfile)
        except Exception as e:
            print(str(e))
            return False
        roads_records = sf.shapeRecords()  # get the road segment records
        for road_record in roads_records:
            cross_s_index = self.add_cross(road_record.shape.points[0])
            cross_e_index = self.add_cross(road_record.shape.points[-1])
            self.add_road_edge(cross_s_index, cross_e_index, road_record)
            if int(road_record.record[self.DIRECTION_index]) == 0:  # if the road segment is a two-way road
                self.add_road_edge(cross_e_index, cross_s_index, road_record)
        return True
Example #29
def load_graph_from_edgelist(dataset, options={}):
    """"""

    edgelist, graph_gt = dataset['path_edgelist'], dataset['path_graph_gt']

    D = None

    # prefer graph_gt file
    if (not 'reconstruct_graph' in options or not options['reconstruct_graph']) and \
        (graph_gt and os.path.isfile( graph_gt )):
        log.info('Constructing DiGraph from gt.xz')
        D = load_graph(graph_gt)

    elif edgelist and os.path.isfile(edgelist):
        log.info('Constructing DiGraph from edgelist')

        if 'dict_hashed' in options and options['dict_hashed']:
            D = load_graph_from_csv(edgelist,
                                    directed=True,
                                    hashed=False,
                                    skip_first=False,
                                    csv_options={
                                        'delimiter': ' ',
                                        'quotechar': '"'
                                    })
        else:
            D = load_graph_from_csv(edgelist,
                                    directed=True,
                                    hashed=True,
                                    skip_first=False,
                                    csv_options={
                                        'delimiter': ' ',
                                        'quotechar': '"'
                                    })

        # check if graph should be dumped
        dump_graph(D, edgelist, options)
    else:
        log.error(
            'edgelist or graph_gt file to read graph from does not exist')
        return None

    return D
Example #30
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")

    c = gt.centrality.closeness(g)

    fig = plt.figure()
    res = []
    for i in c.a.tolist():
        if not str(i) == "nan":
            res.append(i)
    plt.hist(res, label="Closeness", bins=100)
    plt.legend(loc="upper right")
    plt.xlabel("Closeness")
    plt.ylabel("Count")
    fig.savefig('../pic/closeness' + str(num) + '.png')
    print('max of closeness: ' + str(np.max(res)))
    print('min of closeness: ' + str(np.min(res)))
    print('mean of closeness: ' + str(np.mean(res)))
    print('var of closeness: ' + str(np.var(res)))
Example #31
    def __init__(self, agent_generator, network_filename=None, largest_component=False, directed=False, vprop_node_id='nodeID', **kwargs):
        AgentsConnection.__init__(self, agent_generator, **kwargs)
        if network_filename is None:
            self.net = gt.Graph(directed=directed)
        else:
            try:
                self.print_f('load gt file', network_filename)
                self.net = gt.load_graph(network_filename)
                if self.net.is_directed() != directed:
                    self.net.set_directed(directed)
            except:
                self.print_f('failed. fall back to read edge list')
                self.net = read_edge_list(network_filename, directed=directed)

        if largest_component:
            self.print_f('reduce network to largest component')
            lc = gt.topology.label_largest_component(self.net)
            self.net.set_vertex_filter(lc)
            self.net.purge_vertices()

        self.print_f('create agents')
        self.agent_to_vertex = dict()
        agents_pmap = self.net.new_vertex_property('object')
        node_id_pmap = self.net.new_vertex_property('int')
        try:
            node_ids = self.net.vp[vprop_node_id]
        except KeyError:
            self.print_f('No vertex property named:', vprop_node_id)
            self.print_f('Available vertex properties:', self.net.vp.keys())
            self.print_f('Please use "vprop_node_id" param to specify the right one')
            exit()

        for agent_id, v in enumerate(self.net.vertices()):
            agent = self.agent_generator.generate_agent(node_ids[v])
            self.agent_to_vertex[agent] = v
            agents_pmap[v] = agent
            node_id_pmap[v] = int(agent)
        self.net.vp["agents"] = agents_pmap
        self.net.vp["NodeId"] = node_id_pmap
        self.print_f('setup done')
Example #32
def _library_load(filename, fmt):
    ''' Load the file using the library functions '''
    if nngt.get_config("backend") == "networkx":
        import networkx as nx

        if fmt == "graphml":
            return nx.read_graphml(filename)
        else:
            raise NotImplementedError
    elif nngt.get_config("backend") == "igraph":
        import igraph as ig

        if fmt == "graphml":
            return ig.Graph.Read_GraphML(filename)
        else:
            raise NotImplementedError
    elif nngt.get_config("backend") == "graph-tool":
        import graph_tool as gt

        return gt.load_graph(filename, fmt=fmt)
    else:
        raise NotImplementedError
Example #33
def run_minimize_blockmodel(mg, temp_loc):
    # save to temp
    nx.write_graphml(mg.g, temp_loc)
    # load into graph-tool from temp
    g = load_graph(temp_loc, fmt="graphml")
    total_degrees = g.get_total_degrees(g.get_vertices())
    remove_verts = np.where(total_degrees == 0)[0]
    g.remove_vertex(remove_verts)
    min_state = minimize_blockmodel_dl(g, verbose=False)

    blocks = list(min_state.get_blocks())
    verts = g.get_vertices()

    block_map = {}

    for v, b in zip(verts, blocks):
        cell_id = int(g.vertex_properties["_graphml_vertex_id"][v])
        block_map[cell_id] = int(b)

    block_series = pd.Series(block_map)
    block_series.name = "block_label"
    return block_series
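A minimal sketch of how the helper above might be driven, assuming `mg` is an object that exposes a networkx graph as `mg.g` (as the nx.write_graphml call implies) and that writing a temporary GraphML file is acceptable.

# Hypothetical driver; mg is whatever object holds the networkx graph as .g
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    temp_loc = os.path.join(tmp, "graph.graphml")
    block_labels = run_minimize_blockmodel(mg, temp_loc)   # pandas Series indexed by cell id
    print(block_labels.value_counts())                     # community sizes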
Example #34
def work(num):
    g = gt.load_graph("../data/graph" + str(num) + ".xml.gz")

    indegree_list = gt.stats.vertex_hist(g, "in")[0]
    outdegree_list = gt.stats.vertex_hist(g, "out")[0]

    fig = plt.figure()
    plt.plot(indegree_list, label="Indegree distribution")
    plt.plot(outdegree_list, label="Outdegree distribution")
    plt.xlim(0, 40)
    plt.ylim(0, 100)
    plt.legend(loc="upper right")
    plt.xlabel("Degree")
    plt.ylabel("Count")
    fig.savefig("../pic/degree" + str(num) + ".png")

    print('max of indegree : ' + str(np.max(indegree_list)))
    print('mean of indegree : ' + str(np.mean(indegree_list)))
    print('variance of indegree : ' + str(np.var(indegree_list)))

    print('max of outdegree : ' + str(np.max(outdegree_list)))
    print('mean of outdegree : ' + str(np.mean(outdegree_list)))
    print('variance of outdegree : ' + str(np.var(outdegree_list)))
Example #35
def get_graph(balanced=False):
    """Load the graph from BASENAME and optionally remove positive edges to
    balance the graph. NOTE: this only modify redensify structure and not
    graph_tool & its distance matrix"""
    if balanced:
        import persistent
    if os.path.isfile(BASENAME+'.gt'):
        g = graph_tool.load_graph(BASENAME+'.gt')
        dst_mat = np.load(BASENAME+'_dst.npy')
        cexp.to_python_graph(g)
        if balanced:
            to_delete = persistent.load_var(BASENAME+'_balance.my')
            for edge in to_delete:
                pot.delete_edge(redensify.G, edge, redensify.EDGES_SIGN)
        return g, dst_mat
    if not PA:
        cexp.random_signed_communities(2, 500, 13, 11.5/500, .0, .0)
        g = cexp.to_graph_tool()
    else:
        cexp.preferential_attachment(1000, gamma=1.4, m=12)
        cexp.turn_into_signed_graph_by_propagation(2)
        DEGREES = sorted(((node, len(adj))
                          for node, adj in cexp.redensify.G.items()),
                         key=lambda x: x[1])
        u, v = DEGREES[-1][0], DEGREES[-2][0]
        u, v = (v, u) if u > v else (u, v)
        del cexp.redensify.EDGES_SIGN[(u, v)]
        cexp.redensify.G[u].remove(v)
        cexp.redensify.G[v].remove(u)
    n = g.num_vertices()
    dst = shortest_distance(g, dense=False)
    dst_mat = np.zeros((n, n), dtype=np.uint8)
    for v in g.vertices():
        dst_mat[int(v), :] = dst[v].a.astype(np.uint8)
    g.save(BASENAME+'.gt')
    np.save(BASENAME+'_dst', dst_mat)
Example #36
			
				# Add the edge and increase the progress counter
				net.add_edge(tail, head)
				# Show an update every nth node
				count += 1
				if count % 10000 == 0:
					print(count)

		print('finished reading ' + infile)
		# Write it to disk so we don't have to do this again later
		net.save(outfile)
		print('finished saving ' + outfile)
	else:
		print('found saved ' + outfile)
		# Read the saved network
		net = gt.load_graph(outfile)
		# Since vertices as such can't be pickled, we need to reconstruct 
		#  id and id_to_gt manually
		print('finished reading ' + outfile)
		id = net.vertex_properties['id']
		id_to_gt = {id[vertex]: vertex for vertex in net.vertices()}
	
	#print(len(id_to_gt))
	print('total vertices: ' + str(net.num_vertices()))
	print('total edges: ' + str(net.num_edges()))
# 
# 	# How many samples to collect?
# 	n_samples = 1000
# 	# Initialize a container for them
# 	samples = []
# 	# And set a seed
Example #37
def load_data(target_output):
    patient_file = TCGA_root_dir + '/Clinical/Biotab/nationwidechildrens.org_clinical_patient_brca.txt'
    betas_file = TCGA_root_dir + '/betas.npz'
    processed_betas_file = TCGA_root_dir + '/betas-processed.npz'
    graph_dump_file = TCGA_root_dir + '/graph.xml.gz'
    
    if (os.path.isfile(betas_file)):
        data_file = np.load(betas_file)
        betas = data_file['betas']
        col_names = data_file['col_names']
        patient_data = data_file['patient_data']
        sample_names = data_file['sample_names']
        print('found betas_file, shape: %s' % (betas.shape.__str__()))
    else:
        patient_skipped_lines = 3
    
        patient_data = np.array(read_csv(patient_file, skip_header = False))
        patient_data = patient_data[patient_skipped_lines:,]
        sample_names = patient_data[:,0]

        data_dir = TCGA_root_dir + '/DNA_Methylation/JHU_USC__HumanMethylation450/Level_3/'
        files = os.listdir(data_dir)

        col_names = np.empty(0)
        used_samples = np.empty(0)
        unused_samples = np.empty(0)
        multiple_data_samples = np.empty(0)

        i = 0
        for name in sample_names:
            i += 1
            print('processing %3d/%3d ' %(i, len(sample_names)) + name)
            # 01 is the primary tumor sample, A is the vial A
            matched = [f for f in files if f.find(name+'-01A') > -1]
            if (len(matched) > 1):
                multiple_data_samples = np.append(multiple_data_samples, name)
                continue
            elif len(matched) == 0:
                print('no files found.')
                unused_samples = np.append(unused_samples, name)
                continue

            used_samples = np.append(used_samples, name)
            matched = matched[0]

            sample_data = np.array(read_csv(data_dir +
                                            matched, skip_header = False))
            data_skipped_lines = 2
            
            sample_col_names = sample_data[data_skipped_lines:,0]

            if col_names.shape[0] == 0:
                col_names = sample_col_names
                betas = np.empty((0,sample_col_names.shape[0]), dtype=float)
            else:
                if all(col_names == sample_col_names) == False:
                    raise RuntimeError("column names don't match")

            v = sample_data[data_skipped_lines:, 1]
            v[v == 'NA'] = -1
            v = np.array(v, dtype=float)
            v[v == -1] = np.nan
            betas = np.vstack((betas, v.reshape(1,-1)))

        indices = np.array([i for i in range(betas.shape[1])
                            if not any(np.isnan(betas[:,i]))])
        betas = betas[:,indices]
        col_names = col_names[indices]

        sample_indices = np.array([list(sample_names).index(used_samples[i])
                                   for i in range(len(used_samples))])
        patient_data = patient_data[sample_indices,:]
        np.savez(open(betas_file, 'wb'),
                 betas = betas, col_names = col_names,
                 patient_data = patient_data,
                 sample_names = sample_names)
    
    if (os.path.isfile(processed_betas_file)
        and os.path.isfile(graph_dump_file)):
        g = gt.load_graph(graph_dump_file)
        data_file = np.load(processed_betas_file)
        X = data_file['X']
        genes = data_file['genes']
        patient_data = patient_data
        print('processed data found, X: %s' % (X.shape.__str__()))
    else:
        X, g, genes = networkize_illumina450k(betas, col_names)
        print (X.__class__)
        print (genes.__class__)
        print (patient_data.__class__)
        print (g.__class__)
        np.savez(open(processed_betas_file, 'wb'),
                 X = X, genes=genes,patient_data=patient_data)
        g.save(graph_dump_file)

    if (target_output == 'ER'):
        # ER status is column index 43
        labels = patient_data[:,43]
        y = np.zeros(len(patient_data), dtype=int)
        y[labels == 'Negative'] = -1
        y[labels == 'Positive'] = 1

        final_sample_indices = (y != 0)

        y = y[final_sample_indices]
        X = X[final_sample_indices,]
        patient_data = patient_data[final_sample_indices,:]
    elif (target_output == 'stage'):
        # Stage status is column 40
        labels = patient_data[:,40]
        y = np.zeros(len(patient_data), dtype=int)
        y[labels == 'Stage I'] = -1
        y[labels == 'Stage IA'] = -1
        y[labels == 'Stage IB'] = -1
        y[labels == 'Stage II'] = -1
        y[labels == 'Stage IIA'] = -1
        y[labels == 'Stage IIB'] = -1

        y[labels == 'Stage III'] = 1
        y[labels == 'Stage IIIA'] = 1
        y[labels == 'Stage IIIB'] = 1
        
        final_sample_indices = (y != 0)

        y = y[final_sample_indices]
        X = X[final_sample_indices,]
        patient_data = patient_data[final_sample_indices,:]
    elif (target_output == 'T'):
        # T (tumor size) status is column 37
        labels = patient_data[:,37]
        y = np.zeros(len(patient_data), dtype=int)
        y[labels == 'T1'] = -1
        y[labels == 'T1a'] = -1
        y[labels == 'T1b'] = -1
        y[labels == 'T1c'] = -1
        y[labels == 'T2'] = -1
        y[labels == 'T2a'] = -1
        y[labels == 'T2b'] = -1

        y[labels == 'T3'] = 1
        y[labels == 'T3a'] = 1
        y[labels == 'T3b'] = 1
        y[labels == 'T4'] = 1
        y[labels == 'T4a'] = 1
        y[labels == 'T4b'] = 1
        y[labels == 'T4c'] = 1
        y[labels == 'T4d'] = 1
        
        final_sample_indices = (y != 0)

        y = y[final_sample_indices]
        X = X[final_sample_indices,]
        patient_data = patient_data[final_sample_indices,:]
    elif (target_output == 'N'):
        # N (lymph nodes) status is column 38
        labels = patient_data[:,38]
        y = np.zeros(len(patient_data), dtype=int)
        N0s = np.array([labels[i].startswith("N0") for i in range(len(labels))])
        y[N0s] = -1

        N1s = np.array([labels[i].startswith("N1") for i in range(len(labels))])
        y[N1s] = 1
        N2s = np.array([labels[i].startswith("N2") for i in range(len(labels))])
        y[N2s] = 1
        N3s = np.array([labels[i].startswith("N3") for i in range(len(labels))])
        y[N3s] = 1
        
        final_sample_indices = (y != 0)

        y = y[final_sample_indices]
        X = X[final_sample_indices,]
        patient_data = patient_data[final_sample_indices,:]
    else:
        raise RuntimeError("target_output not in ('ER', 'T', 'N', 'stage')")

    return (X, y, g, patient_data, genes)
    
Example #38
    import random as r
    import args_experiments as ae
    import convert_experiment as cexp
    import real_world as rw
    import redensify
    parser = ae.get_parser('Compute a galaxy tree')
    args = parser.parse_args()
    a = ae.further_parsing(args)
    basename, seeds, synthetic_data, prefix, noise, balanced = a

    if synthetic_data:
        try:
            ae.load_raw(basename, redensify, args)
        except IOError:
            import graph_tool as gt
            g = gt.load_graph(basename+'.gt')
            cexp.to_python_graph(g)
    else:
        rw.read_original_graph(basename, seed=args.seed, balanced=balanced)
        redensify.G = deepcopy(rw.G)
        redensify.EDGES_SIGN = deepcopy(rw.EDGE_SIGN)

    suffixes = ('_bal' if args.balanced else '',
                '_short' if args.short else '',
                '_safe' if args.safe else '', args.seed)
    outname = 'lp10/{}{}{}{}_{}'.format(args.data.lower(), *suffixes)
    print(outname)
    res = meta_galaxy(redensify.G, redensify.EDGES_SIGN, 10, outname,
                      safe=args.safe, short=args.short)
    if args.safe:
        gold, pred, _ = res
Example #39
	def loadGraph(self, path):
		G = graph_tool.load_graph(path, fmt="gml")
		return G
Example #40
def main(argv=None):
    # parse arguments (not used yet)
    if argv is None:
        argv = sys.argv
    # load parameters
    f = open(paramFn, 'r')
    my_params = pickle.load(f)
    f.close()
    # load graph topology
    g = gt.load_graph(graphFn)
    g.reindex_edges()
    num_vertices = g.num_vertices()
    num_edges = g.num_edges()
    # store vertex types
    vertex_types = np.array( g.vertex_properties["type"].get_array(), 
                             dtype=np.int )
    # construct an edge list
    edge_list = np.zeros( (num_edges, 3) )
    # also a lookup table for in-edges
    # this requires a degree list
    in_degrees = np.array( g.degree_property_map("in").get_array(),
                           dtype=np.int )
    max_degree = np.max( in_degrees )
    if num_edges > 0:
        # "ragged" array of in-edges
        in_edges = np.zeros( (num_vertices, max_degree), dtype=np.int )
        gsyn_props = g.edge_properties["gsyn"]
    else:
        in_edges = np.zeros( (num_vertices, max_degree), dtype=np.int )
        gsyn_props = []
    # for looping
    in_edge_ct = np.zeros( (num_vertices,), dtype=np.int )
    i = 0
    for e in g.edges():
        source_index = int( e.source() )
        target_index = int( e.target() )
        edge_list[i,...] = [source_index, 
                            target_index,
                            gsyn_props[e]]
        in_edges[ target_index, in_edge_ct[target_index] ] = i
        # increment indices
        in_edge_ct[ target_index ] += 1
        i += 1
    ## setup initial conditions
    # state will contain vertex variables & edge
    # variables in a 1d array
    N = num_vertices*num_eqns_per_vertex +\
        num_edges*num_eqns_per_edge
    # state vector y encodes vertex and edge data
    y = np.zeros(N)
    for i in range( num_vertices ):
        # vertex data in 0:num_eqns_per_vertex*num_vertices-1
        j = range(i*num_eqns_per_vertex, (i+1)*num_eqns_per_vertex)
        #print(j)
        y[j] = [
            -0.026185387764343,
             0.318012107836673,
             0.760361103277830,
             0.681987892188221,
             0.025686471226045,
             0.050058183820371,
             4.998888741335261
             ]
    offset = num_vertices*num_eqns_per_vertex
    for i in range( num_edges ):
        j = range(offset + i*num_eqns_per_edge,
                  offset + (i+1)*num_eqns_per_edge)
        #print(j)
        y[j] = 0.000001090946631
    #print(N)
    print(y)
    
    # f is the rhs with parameters evaluated
    def f(t, y):
        dydt = prebotc.rhs(t, y, 
                           vertex_types,
                           edge_list, 
                           in_edge_ct,
                           in_edges,
                           my_params)
        return dydt
    
    # output vector of states
    save_state = np.zeros( (N, Nstep) ) 

    ## hard-coded Euler method
    t = t0;
    for i in range(Nstep):
        dydt = f(t, y)
        y = y + dydt * dt # fwd Euler
        #save_state[:, i] = y[ 0:(num_vertices*num_eqns_per_vertex):num_eqns_per_vertex ] # just voltages
        save_state[:, i] = y; # all vars
        t = t + dt;
        if ( (i+1)%report_every ) == 0:
            print(t)
            
    scipy.io.savemat(outFn, mdict={'Y': save_state},
                     oned_as = 'col')
Example #41
def poisson_disk_sample(left_hemisphere, right_hemisphere, num_regions, wmm=None, output_filename='pds_results.pickle', create_niftii=True, output_basename='randomLabels-'):
    
    # if left and right hemisphere is given as a string (filename), load
    if ((not isinstance(left_hemisphere, str)) or (not isinstance(right_hemisphere, str))):
        raise TypeError('Input error: poisson_disk_sampling(...) expects the first two arguments to be filenames.')
    
    # check if num regions is a list
    if (not isinstance(num_regions, list)):
        raise TypeError('Input error: poisson_disk_sampling(...) expects number of regions to be a list.')
    
    # ensure output_filename ends with pickle
    if ((not (output_filename[-7::]=='.pickle'))|(not isinstance(output_filename, str))):
        raise TypeError('Input error: poisson_disksampling(...) expects output name (string) ending in ".pickle".')
    
    for which, each in enumerate(num_regions):
        if ((each%2)!=0):
            print('Input error: Number of regions uneven. Fixing it by subtracting 1...')
            num_regions[which] = each-1
        
    # set graph file names
    graph_file_left = 'graph_skel_left.xml.gz'
    graph_file_right = 'graph_skel_right.xml.gz'

    # prepare imaging data for generating voxel level graphs
    print('Preparing data...')
    imgData_left, matrix_coords_left = misc.prepare_data(left_hemisphere,wmm=wmm)
    imgData_right, matrix_coords_right = misc.prepare_data(right_hemisphere,wmm=wmm)

    # if graph files do not exist, calculate and save, else load
    if ((not os.path.isfile(graph_file_left)) & (not os.path.isfile(graph_file_right))):        
        # left hemisphere
        graph_left = generate_graph(matrix_coords_left, imgData_left.copy())
        graph_left.save(graph_file_left)
        
        #right hemisphere
        graph_right = generate_graph(matrix_coords_right, imgData_right.copy())
        graph_right.save(graph_file_right)
            
    else:
        graph_left = gt.load_graph(graph_file_left)
        graph_right = gt.load_graph(graph_file_right)
    
    print('Finding region centres...')
    if os.path.isfile(output_filename):
        with open(output_filename,'rb') as outputfile:
            [regions, thresholds_left, thresholds_right, ignore] = pickle.load(outputfile)
    else:
        regions = []
        thresholds_left = []
        thresholds_right = []
        
    # for each entry in the num_regions list
    for idx, each in enumerate(num_regions):
        print('Parcellating for %04d regions' %each)
        # find parcellations in both hemispheres
        thresh_left, regions_left = parcellate_hemisphere(graph_left, matrix_coords_left, int(each/2.))
        # right hemisphere takes in distance estimate from left hemisphere, as a starting point
        thresh_right, regions_right = parcellate_hemisphere(graph_right, matrix_coords_right, int(each/2.), thresh_left)

        # restructure to save
        regions.append([regions_left[0], regions_right[0]])
        thresholds_left.append(thresh_left)
        thresholds_right.append(thresh_right)

        print('Saving progress...')
        # save results to file
        with open(output_filename,'wb') as outputfile:
            pickle.dump([regions, thresholds_left, thresholds_right, num_regions], outputfile)
    
        if create_niftii:
            print('Creating niftii files...')
            create_parcellation(output_filename, [each], left_hemisphere, right_hemisphere, output_basename)
Example #42
 def load_graphtool(path, fmt):
     gt = _import_graphtool()
     graph = gt.load_graph(path, fmt=fmt)
     return cls.from_graphtool(graph)
Example #43
def g():
    return load_graph('data/{}/2-6/graph.gt'.format('grid'))
Example #44
def load_data(input_dir,
              target_labels, sample_type=None, patient_annot_file=None,
              final_dump_folder = None, networkize_data = False):
    if (sample_type == None):
        print("sample type must be given. For example 01A (as suffix to patient codes.)")
        return
    
    dump_dir = input_dir + '/processed'
    if (not os.path.exists(dump_dir)):
        os.mkdir(dump_dir)

    if (patient_annot_file == None):
        patient_file_candidates = glob.glob(input_dir + '/Clinical/Biotab/nationwidechildrens.org_clinical_patient*.txt')
        if (len(patient_file_candidates) != 1):
            print('ERROR: patient_file_candidates: ', patient_file_candidates)
            return(None)
        patient_annot_file = patient_file_candidates[0]

    patient_annot_processed_file = dump_dir + '/patient_annot.npz'
    betas_file = dump_dir + '/betas.npz'
    processed_betas_file = dump_dir + '/betas-processed.npz'
    gene_annot_file = dump_dir + '/genes.npz'
    graph_dump_file = dump_dir + '/graph.xml.gz'
    calculated_L_matrix = dump_dir + '/L.npz'

    '''
        here we load the annotation and batch information of the samples
    '''
    if (os.path.isfile(patient_annot_processed_file)):
        data_file = np.load(patient_annot_processed_file)
        patient_annot = data_file['patient_annot']
        patient_annot_colnames = data_file['patient_annot_colnames']
        patient_codes = data_file['patient_codes']
    else:
        patient_skipped_lines = 3
    
        patient_data = np.array(read_csv(patient_annot_file, skip_header = False))
        patient_annot_colnames = patient_data[0,:]
        patient_annot = patient_data[patient_skipped_lines:,]
        patient_codes = patient_data[patient_skipped_lines:,0]

        xml_dir = input_dir + '/Clinical/XML'

        '''
        here I look for the admin:batch_number key in xml files of the patients,
        extract that line, remove extra stuff with sed, and get a two column text
        with patient ids and batch numbers.
        '''
        output = subprocess.check_output("grep \"admin:batch_number xsd_ver=\" %s/*_clinical*.xml | awk '{print $1 \"\t\" $3}' | sed \"s/.*clinical\.//g\" | sed \"s/\.xml:\t.*\\\">/\t/g\" | sed \"s/\..*//g\"" % (xml_dir),
                                         shell=True,
                                         universal_newlines=True).splitlines()
        patient_batches_dict = {output[i].split('\t')[0]:output[i].split('\t')[1]
                                for i in range(len(output))}

        patient_batches = np.zeros(len(patient_codes), dtype=int)
        for i in range(len(patient_codes)):
            patient_batches[i] = patient_batches_dict[patient_codes[i]]
        patient_annot = np.hstack((patient_annot, patient_batches.reshape(-1,1)))
        patient_annot_colnames = np.append(patient_annot_colnames, 'batch_number')
        
        np.savez(open(patient_annot_processed_file, 'wb'),
                 patient_annot = patient_annot,
                 patient_annot_colnames = patient_annot_colnames,
                 patient_codes = patient_codes)
            

    '''
        in this section the methylation beta values are extracted and put into
        a matrix loaded from 450k illumina chip.
    '''
    if (os.path.isfile(betas_file)):
        data_file = np.load(betas_file)
        betas = data_file['betas']
        col_names = data_file['col_names']
        sample_indices = data_file['methylation_45k_sample_indices']
        print('found betas_file, shape: %s' % (betas.shape.__str__()))
    else:
        data_dir = input_dir + '/DNA_Methylation/JHU_USC__HumanMethylation450/Level_3/'
        if (os.path.exists(data_dir)):
            sample_indices, col_names, betas, debug_info = \
                load_450k_methylation(data_dir, patient_codes, sample_type)
            print(debug_info)
                
            np.savez(open(betas_file, 'wb'),
                 betas = betas, col_names = col_names,
                 methylation_45k_sample_indices = sample_indices)


    """
    Don't use the PPI network if no network is needed, and return raw
    beta values.
    """
    if not networkize_data:
        processed_data = dump_by_target_label(betas, target_labels, patient_annot,
                        patient_annot_colnames, sample_indices, None,
                        dump_dir)

        return (processed_data, None, col_names)
    
    '''
        use the graph to map nodes to genes and get the graph itself.
    '''
    if (os.path.isfile(processed_betas_file)
        and os.path.isfile(graph_dump_file)
        and os.path.isfile(gene_annot_file)):
        g = gt.load_graph(graph_dump_file)
        data_file = np.load(processed_betas_file)
        X = data_file['X']
        data_file = np.load(gene_annot_file)
        genes = data_file['genes']
        print('processed data found, X: %s' % (X.shape.__str__()))
    else:
        X, g, genes = networkize_illumina450k(betas, col_names)
        print (X.__class__)
        print (genes.__class__)
        print (g.__class__)
        np.savez(open(processed_betas_file, 'wb'), X = X)
        np.savez(open(gene_annot_file, 'wb'), genes=genes)
        g.save(graph_dump_file)


    if (os.path.isfile(calculated_L_matrix)):
        data_file = np.load(calculated_L_matrix)
        L = data_file['L']
        print('found L matrix, shape: %s' % (L.shape.__str__()))
    else:
        print("calculating L and transformation of the data...")
        B = gt.spectral.laplacian(g)
        M = np.identity(B.shape[0]) + Globals.beta * B
        M_inv = np.linalg.inv(M)
        L = np.linalg.cholesky(M_inv)
        np.savez(open(calculated_L_matrix, 'wb'),
                 L = L)
        
    if (final_dump_folder != None):
        dump_dir = final_dump_folder
        
    processed_data = dump_by_target_label(X, target_labels, patient_annot,
                        patient_annot_colnames, sample_indices, L,
                        dump_dir)

    return (processed_data, g, genes)
Example #45
        if (sys.argv[i] == '--cv-index'):
            cv_index = int(sys.argv[i + 1]) - 1
        if (sys.argv[i] == '--regularizer-index'):
            regularizer_index = int(sys.argv[i + 1])

    print(working_dir, method, cv_index, regularizer_index, file=sys.stderr)

    print("loading data...", file=sys.stderr)

    data_file = np.load(working_dir + '/npdata.npz')
    tmpX = data_file['tmpX']
    X_prime = data_file['X_prime']
    y = data_file['y']
    sample_annotation = data_file['sample_annotation']
    feature_annotation = data_file['feature_annotation']
    g = gt.load_graph(working_dir + '/graph.xml.gz')
    cvs = pickle.load(open(working_dir + '/cvs.dmp', 'rb'))

    #choosing only one cross-validation fold
    tmp = list()
    tmp.append((cvs[cv_index]))
    cvs = tmp
    
    cpu_count = 1
    max_learner_count = 3
    rat_scores = dict()
    all_scores = defaultdict(list)

    if (method == 'others'):

        machine = svm.NuSVC(nu=0.25,
Example #46
import graph_tool as gt
import graph_tool.community as gtcomm
#import graph_tool.draw as gtdraw
import graph_tool.topology as gtopo

from matplotlib.cm import OrRd_r, OrRd

from analyze_net import layout_and_plot

net = gt.load_graph('autnet0.out.gt')
core = net.vp['core']
core_vertices = [vertex for vertex in net.vertices() if core[vertex]]

print('total v: ' + str(net.num_vertices()))
print('total core: ' + str(len([vertex for vertex in net.vertices() if core[vertex]])))

# For citenet
# cutoff = 2013
# cutoff_pmap = net.new_vp('bool', vals = [net.vp['year'][vertex] > cutoff for vertex in net.vertices()])
# net.set_vertex_filter(cutoff_pmap)
# core_vertices = [vertex for vertex in net.vertices() if core[vertex]]
# print(cutoff)

# For autnets
cutoff = 0
cutoff_pmap = core.copy()
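# with cutoff > 0, infect_vertex_property would grow the core set by one
# neighbourhood step per iteration; cutoff = 0 keeps only the original core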
for i in range(cutoff):
    gt.infect_vertex_property(net, cutoff_pmap, vals = [True])
net.set_vertex_filter(cutoff_pmap)
core_vertices = [vertex for vertex in net.vertices() if core[vertex]]
print('cutoff v: ' + str(net.num_vertices()))
import pickle
import numpy as np
import core.FCE
import core.raccoon
import graph_tool as gt

if __name__ == '__main__':
    input_dir = '../data/TCGA-SARC/vital_status'
    data_file = np.load(input_dir + '/data.npz')
    X = data_file['X']
    X_prime = data_file['X_prime']
    y = data_file['y']
    sample_annotation = data_file['patient_annot']
    data_file = np.load(input_dir + '/../genes.npz')
    feature_annotation = data_file['genes']
    g = gt.load_graph(input_dir + '/../graph.xml.gz')
    cvs = pickle.load(open(input_dir + '/batch_cvs.dmp', 'rb'))

    # choosing only one cross-validation fold
    cvs_results = list()
    for cv_index in range(len(cvs)):
    #for cv_index in [3]:
        tcvs = [cvs[cv_index]]

        Xtrain = X[tcvs[0][0], ]
        ytrain = y[tcvs[0][0], ]
        Xtest = X[tcvs[0][1], ]
        ytest = y[tcvs[0][1], ]
def open_and_apply_filters(filename):
    G = gt.load_graph(filename)
    G.set_vertex_filter(G.vertex_properties['in_USPTO_tree'])
    return G
def load_data(target_output):
    patient_file = TCGA_root_dir + '/Clinical/Biotab/nationwidechildrens.org_clinical_patient_laml.txt'
    expressions_file = TCGA_root_dir + '/expressions.npz'
    processed_expressions_file = TCGA_root_dir + '/expressions-processed.npz'
    graph_dump_file = TCGA_root_dir + '/graph-geneexpression.xml.gz'
    
    if (os.path.isfile(expressions_file)):
        data_file = np.load(expressions_file)
        expressions = data_file['expressions']
        col_names = data_file['col_names']
        patient_data = data_file['patient_data']
        sample_names = data_file['sample_names']
        print('found expressions_file, shape: %s' % (expressions.shape.__str__()))
    else:
        patient_skipped_lines = 3
    
        patient_data = np.array(read_csv(patient_file, skip_header = False))
        patient_data = patient_data[patient_skipped_lines:,]
        sample_names = patient_data[:,0]

        data_dir = TCGA_root_dir + '/Expression-Genes/WUSM__HG-U133_Plus_2/Level_3/'
        files = os.listdir(data_dir)

        col_names = np.empty(0)
        used_samples = np.empty(0)
        unused_samples = np.empty(0)
        multiple_data_samples = np.empty(0)

        i = 0
        for name in sample_names:
            i += 1
            print('processing %3d/%3d ' %(i, len(sample_names)) + name)
            # 03A : Primary Blood Derived Cancer - Peripheral Blood
            matched = [f for f in files if f.find(name+'-03A') > -1]
            if (len(matched) > 1):
                multiple_data_samples = np.append(multiple_data_samples, name)
                continue
            elif len(matched) == 0:
                print('no files found.')
                unused_samples = np.append(unused_samples, name)
                continue

            used_samples = np.append(used_samples, name)
            matched = matched[0]

            sample_data = np.array(read_csv(data_dir +
                                            matched, skip_header = False))
            data_skipped_lines = 2
            
            sample_col_names = sample_data[data_skipped_lines:,0]

            if col_names.shape[0] == 0:
                col_names = sample_col_names
                expressions = np.empty((0,sample_col_names.shape[0]), dtype=float)
            else:
                if not np.all(col_names == sample_col_names):
                    raise RuntimeError("column names don't match")

            v = sample_data[data_skipped_lines:, 1]
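            # 'NA' strings cannot be cast to float directly: encode them as -1,
            # cast the column, then restore them as NaN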
            v[v == 'NA'] = -1
            v = np.array(v, dtype=float)
            v[v == -1] = np.nan
            expressions = np.vstack((expressions, v.reshape(1,-1)))

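        # keep only probes (columns) that have a value for every sample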
        indices = np.array([i for i in range(expressions.shape[1])
                            if not any(np.isnan(expressions[:,i]))])
        expressions = expressions[:,indices]
        col_names = col_names[indices]

        sample_indices = np.array([list(sample_names).index(used_samples[i])
                                   for i in range(len(used_samples))])
        patient_data = patient_data[sample_indices,:]
        np.savez(open(expressions_file, 'wb'),
                 expressions = expressions, col_names = col_names,
                 patient_data = patient_data,
                 sample_names = sample_names)
    
    if (os.path.isfile(processed_expressions_file)
        and os.path.isfile(graph_dump_file)):
        g = gt.load_graph(graph_dump_file)
        data_file = np.load(processed_expressions_file)
        X = data_file['X']
        genes = data_file['genes']
        # patient_data keeps the values loaded from expressions_file above
        print('processed data found, X: %s' % (X.shape.__str__()))
    else:
        X, g, genes = networkize_illuminaU133(expressions, col_names)
        print (X.__class__)
        print (genes.__class__)
        print (patient_data.__class__)
        print (g.__class__)
        np.savez(open(processed_expressions_file, 'wb'),
                 X = X, genes=genes,patient_data=patient_data)
        g.save(graph_dump_file)

    if (target_output == 'risk_group'):
        # cyto_risk_group status is column index 50
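        # binary target: Favorable -> -1, Intermediate/Normal or Poor -> +1;
        # any other label leaves y == 0 and the sample is dropped below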
        labels = patient_data[:,50]
        y = np.zeros(len(patient_data), dtype=int)
        y[labels == 'Favorable'] = -1
        y[labels == 'Intermediate/Normal'] = 1
        y[labels == 'Poor'] = 1
        
        final_sample_indices = (y != 0)

        y = y[final_sample_indices]
        X = X[final_sample_indices,]
        patient_data = patient_data[final_sample_indices,:]
    elif (target_output == 'vital_status'):
        # vital_status status is column index 15
        labels = patient_data[:,15]
        y = np.zeros(len(patient_data), dtype=int)
        y[labels == 'Alive'] = -1
        y[labels == 'Dead'] = 1
        
        final_sample_indices = (y != 0)

        y = y[final_sample_indices]
        X = X[final_sample_indices,]
        patient_data = patient_data[final_sample_indices,:]
    else:
        raise RuntimeError("target_output not in ('risk_group', 'vital_status')")

    return (X, y, g, patient_data, genes)
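A short usage sketch for the loader above, assuming TCGA_root_dir and the helpers read_csv and networkize_illuminaU133 are defined elsewhere in the same module:

if __name__ == '__main__':
    # build (or reload from cache) the expression matrix, gene network and labels
    X, y, g, patient_data, genes = load_data('vital_status')
    print('X: %s, positives: %d, genes: %d' % (X.shape.__str__(), (y == 1).sum(), len(genes)))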
Exemple #50
0
    ds_file = 'epi_graph_dst.npy'
    orig_file = 'soc-sign-epinions.txt'
    prefix = 'epi'
    size = 131580

n = size
idx = int(sys.argv[1])

def print_diag(msg):
    global start, idx
    info = '{}{:.2f} seconds\n'.format
    with open('{}_out.{}'.format(prefix, idx), 'a') as f:
        f.write(info(msg.ljust(60), clock() - start))
    start = clock()

k = gt.load_graph(graph_file)
dst_mat = np.load(ds_file)
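# restrict the graph to its largest connected component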
lcc = label_largest_component(k)
k.set_vertex_filter(lcc)
lcc_nodes = np.where(lcc.a)[0]
slcc = set(lcc_nodes)
all_lcc_edges = {(int(u), int(v)) for u, v in k.edges() if int(u) in slcc}
rw.read_original_graph(orig_file)
high_degree = [_[0] for _ in rw.DEGREES[-200:][::-1]]
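# recode boolean edge signs as +1 / -1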
for e, s in rw.EDGE_SIGN.items():
    rw.EDGE_SIGN[e] = 1 if s else -1
print_diag('load graph')
root = high_degree[idx]
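# LCC edges not covered by the BFS tree rooted at this high-degree vertex
# form the candidate test edges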
bfs_tree = set(pot.get_bfs_tree(rw.G, root))
test_edges = all_lcc_edges - bfs_tree
test_graph = {}
Exemple #51
0
import graph_tool as gt
import sys

# Read the input graph file name from the command line
if len(sys.argv) == 2 :
  filename = sys.argv[1]
else :
  print " Usage : python reader.py graph.gml"
  exit(0)

# load the graph and store the vertex property "name" in the variable names
G = gt.load_graph(filename, "xml")
names = G.vertex_properties["name"]

# iterate over vertices of graph
for i in G.vertices() :
  if names[i] == "&" :
    print " + and Found"
  elif names[i] == "|" :
    print " + or found "
  elif names[i] == "!" :
    print " + not found " 
  else :
    print " + Input node found with name {0}".format(names[i])


Exemple #52
0
                                 attributes=EdgeAttributes)
    EdgeAttributes = {}
    for Edge,Value in networkx.get_edge_attributes(G=CompleteGraph,
                                                   name='TraitGeneEdgeValue').iteritems():
        try:
            EdgeAttributes[Edge] = float(Value)
        except (ValueError, TypeError):
            EdgeAttributes[Edge] = -999.0
    networkx.set_edge_attributes(G=CompleteGraph,
                                 name='TraitGeneEdgeValue',
                                 attributes=EdgeAttributes)
    networkx.write_graphml(G=CompleteGraph,
                           path='Data/CompleteGraphTypeCasted.graphml',
                           encoding='utf-8',
                           prettyprint=True)

    NodeDeleteList = []
    for Node in CompleteGraph.nodes():
        if((CompleteGraph.node[Node]['NodeInENGAGEMA']==0) and
           (CompleteGraph.node[Node]['NodeInENGAGE']==0)):
            NodeDeleteList.append(Node)
    for Node in NodeDeleteList:
        CompleteGraph.remove_node(Node)
    networkx.write_graphml(G=CompleteGraph,
                           path='Data/InENGAGEGraph.graphml',
                           encoding='utf-8',
                           prettyprint=True)
    InENGAGEMAGraph = graph_tool.load_graph(file_name='Data/InENGAGEGraph.graphml',fmt='xml')
    print InENGAGEMAGraph

def create_parcellation(input_filename, num_regions, left_hemisphere, right_hemisphere, output_basename='randomLabels-'):
    # load data    
    with open(input_filename,'rb') as input_file:
        [regions, thresholds_left, thresholds_right, ignore] = pickle.load(input_file)
        
        # find number of regions that exist
        num_regions_list = []
        for each_entry in regions:
            if ((each_entry[0] is None) or (each_entry[1] is None)):
                num_regions_list.append(0)
            else:
                num_regions_list.append(len(each_entry[0])+len(each_entry[1]))
    
        # prepare imaging data for generating voxel level graphs
        imgData_left, matrix_coords_left = misc.prepare_data(left_hemisphere)
        imgData_right, matrix_coords_right = misc.prepare_data(right_hemisphere)
    
        # define graph file names
        graph_file_left = 'graph_voxel_left.xml.gz'
        graph_file_right = 'graph_voxel_right.xml.gz'
    
        # if graph files do not exist, calculate and save, else load
        if ((not os.path.isfile(graph_file_left)) and (not os.path.isfile(graph_file_right))):
            print('- Creating voxel-level graphs...')
            graph_left = generate_graph(matrix_coords_left, imgData_left.copy())
            graph_left.save(graph_file_left)
            
            graph_right = generate_graph(matrix_coords_right, imgData_right.copy())
            graph_right.save(graph_file_right)
                
        else:
            graph_left = gt.load_graph(graph_file_left)
            graph_right = gt.load_graph(graph_file_right)
    
        
        for number_of_regions in np.unique(num_regions):
            # find index for the number of regions
            idx_list = np.where(np.asarray(num_regions_list)==number_of_regions)[0]
    
            # check if this number of regions exist and assign indices to populate hemispheres
            if (len(idx_list) == 0):
                print('Number of regions not found. Try filling the gaps or choose different number of regions.')
            else:
                for count, idx in enumerate(idx_list):
                    img = nib.load(left_hemisphere)
                    print('- Populating image...')
                    ## populate image
                    file_template = output_basename + '%03d' % number_of_regions + '_%03d' % (count+1) + '.nii.gz'
                    # left hemisphere
                    left_random_label = populate_mask(graph_left, regions[idx][0], imgData_left.copy(), matrix_coords_left)
                    outimg = nib.Nifti1Image(left_random_label, header=img.get_header(), affine=img.get_affine())
                    outimg.to_filename('left_' + file_template)
                    
                    # right hemisphere
                    right_random_label = populate_mask(graph_right, regions[idx][1], imgData_right.copy(), matrix_coords_right, offset=left_random_label.max())
                    outimg = nib.Nifti1Image(right_random_label, header=img.get_header(), affine=img.get_affine())
                    outimg.to_filename('right_' + file_template)
            
                    # combined image
                    random_label = left_random_label + right_random_label
                    outimg = nib.Nifti1Image(random_label, header=img.get_header(), affine=img.get_affine())
                    outimg.to_filename(file_template)