def read_graph_labels():
    """Read the label file and return a dict of graph id -> 0-based label.

    The file is ``<data path>/<conf.infolder>/label/label.txt``. Every line
    must hold exactly two whitespace-separated fields: a file name, which is
    turned into a graph id with ``get_file_base_id``, and a numeric label.
    The original labels are renumbered 0, 1, 2, ... in order of first
    appearance in the file.
    """
    label_path = get_data_path() + '/{}/label/label.txt'.format(conf.infolder)
    orig_to_new = {}
    labels_by_gid = {}
    with open(label_path) as fin:
        for raw in fin:
            fields = raw.rstrip().split()
            assert (len(fields) == 2)
            name, label_str = fields
            gid = get_file_base_id(name)
            orig_label = int(float(label_str))
            # Membership test (not truthiness): a new label of 0 is valid.
            if orig_label not in orig_to_new:
                orig_to_new[orig_label] = len(orig_to_new)
            labels_by_gid[gid] = orig_to_new[orig_label]
    return labels_by_gid
Ejemplo n.º 2
0
def gen_imdb_multi():
    """Build a dataset from the graphs in imdb_comedy_romance_scifi/graph.

    Each ``.gexf`` file is read, tagged with its graph id, stripped of the
    ``node_class`` node attribute and the ``weight`` edge attribute, and the
    resulting graphs are handed to ``gen_dataset``.
    """
    graph_dir = get_data_path() + '/imdb_comedy_romance_scifi/graph'
    max_nodes = float('inf')  # size cap; infinite, so no graph is dropped
    kept = []
    for path in glob(graph_dir + '/*.gexf'):
        graph = nx.read_gexf(path)
        gid = get_file_base_id(path)
        print(gid, graph.number_of_nodes())
        if graph.number_of_nodes() > max_nodes:
            continue
        graph.graph['gid'] = gid
        for node in graph.nodes():
            del graph.node[node]['node_class']
        for _, _, attrs in graph.edges_iter(data=True):
            del attrs['weight']
        kept.append(graph)
    print(len(kept))
    gen_dataset(kept)
def main():
    """Convert the .gexf graphs of ``conf.infolder`` into datasets.

    Every graph is tagged with its id and its label from
    ``read_graph_labels``. Graphs that are not connected are replaced by
    their largest connected component and collected separately from the
    connected ones; the number of removed nodes is recorded per graph id.
    Both collections go through ``gen_dataset``, and the label map and the
    info map are written under ``conf.outfolder``.
    """
    graph_dir = get_data_path() + '/{}/graph'.format(conf.infolder)
    max_nodes = float('inf')  # size cap; infinite, so no graph is dropped
    connected = []
    glabel_map = read_graph_labels()
    info_map = {}
    disconnected = []
    paths = glob(graph_dir + '/*.gexf')
    if conf.need_sort_:
        paths = sorted_nicely(paths)
    for idx, path in enumerate(paths):
        graph = nx.read_gexf(path)
        gid = get_file_base_id(path)
        print(idx, gid, graph.number_of_nodes())
        if graph.number_of_nodes() > max_nodes:
            continue
        if nx.is_connected(graph):
            note = ''
            connected.append(graph)
        else:
            print(gid, 'is not connected')
            full_size = graph.number_of_nodes()
            # Keep only the largest connected component.
            graph = max(nx.connected_component_subgraphs(graph), key=len)
            removed = full_size - graph.number_of_nodes()
            assert (removed > 0)
            note = 'rm_{}_nodes'.format(removed)
            disconnected.append(graph)
        info_map[gid] = note
        graph.graph['gid'] = gid
        graph.graph['label'] = glabel_map[gid]
        for _, attrs in graph.nodes(data=True):
            # Read unconditionally: every node must carry 'node_class'.
            node_class = attrs['node_class']
            if conf.has_node_type:
                # Rename the attribute 'node_class' -> 'type'.
                attrs.pop('node_class')
                attrs['type'] = node_class
        for _, _, edge_attrs in graph.edges_iter(data=True):
            del edge_attrs['weight']
    print(len(connected))
    gen_dataset(connected)
    gen_dataset(disconnected)
    save_glabels_as_txt(
        get_data_path() + '/{}/glabels'.format(conf.outfolder), glabel_map)
    save_glabels_as_txt(
        get_data_path() + '/{}/info'.format(conf.outfolder), info_map)
Ejemplo n.º 4
0
 def _load_emb(self, train):
     """Return the graph2vec embedding matrix for the train or test split.

     If the cached ``.npy`` file under the result path exists, it is loaded
     and returned as is. Otherwise a zero matrix of shape
     ``(len(data.graphs), self.dim)`` is filled from the entries returned by
     ``self._load_json_emb()``, saved to that ``.npy`` path, and returned.

     Args:
         train: truthy selects the 'train' split, falsy the 'test' split.

     Returns:
         The embedding array (loaded from cache or freshly built).

     Raises:
         RuntimeError: if the number of embeddings placed into the matrix
             differs from ``len(id_map)``.
     """
     split = 'train' if train else 'test'
     fn = get_result_path() + '/{}/emb/{}_graph2vec_{}_emb_dim_{}.npy'.format(
         self.dataset, self.dataset, split, self.dim)
     if isfile(fn):
         emb = np.load(fn)
         print('Loaded emb {} from {}'.format(emb.shape, fn))
         return emb
     data = load_data(self.dataset, train=train)
     # Used below as a lookup from graph id to row index in the matrix.
     id_map = self._gid_to_matrixid(data)
     emb = np.zeros((len(data.graphs), self.dim))
     cnt = 0
     d = self._load_json_emb()
     for f in d:
         gid = get_file_base_id(f)
         # Entries whose id is not in this split are skipped.
         if gid in id_map:
             emb[id_map[gid]] = d[f]
             cnt += 1
     if cnt != len(id_map):
         # Format the message string itself; calling .format on the exception
         # object would raise AttributeError instead of this RuntimeError.
         raise RuntimeError(
             'Mismatch: {} != {}'.format(cnt, len(id_map)))
     np.save(fn, emb)
     print('Saved emb {} to {}'.format(emb.shape, fn))
     return emb
Ejemplo n.º 5
0
def get_old_aids_id():
    """Return the base ids of all .gxl files in data/AIDS_old/data.

    The directory is resolved under ``get_root_path()``; each matching file
    path is converted with ``get_file_base_id``, in the order ``glob``
    yields them.
    """
    pattern = get_root_path() + '/data/AIDS_old/data/*.gxl'
    ids = []
    for path in glob(pattern):
        ids.append(get_file_base_id(path))
    return ids