def read_graph_labels():
    """Read the graph label file and return a ``{gid: label}`` dict.

    The file ``<data path>/<conf.infolder>/label/label.txt`` is expected to
    hold exactly two whitespace-separated fields per line: a graph file
    name (reduced to an id via ``get_file_base_id``) and a numeric label.
    Original labels are re-indexed to consecutive 0-based integers in
    order of first appearance.
    """
    label_path = get_data_path() + '/{}/label/label.txt'.format(
        conf.infolder)
    gid_to_label = {}
    orig_to_new = {}
    with open(label_path) as fin:
        for line in fin:
            fields = line.split()
            assert (len(fields) == 2)
            fname, raw_label = fields
            gid = get_file_base_id(fname)
            orig_label = int(float(raw_label))
            # Membership test (not truthiness): a valid new label can be 0.
            if orig_label not in orig_to_new:
                # Next unused 0-based index.
                orig_to_new[orig_label] = len(orig_to_new)
            gid_to_label[gid] = orig_to_new[orig_label]
    return gid_to_label
def gen_imdb_multi():
    """Build the IMDB comedy/romance/scifi dataset from its GEXF files.

    Every ``*.gexf`` graph under the dataset's ``graph`` directory is
    loaded, tagged with its ``gid``, stripped of the ``node_class`` node
    attribute and the ``weight`` edge attribute, and then handed to
    ``gen_dataset``. Uses the NetworkX 1.x API (``g.node``, ``edges_iter``).
    """
    graph_dir = get_data_path() + '/imdb_comedy_romance_scifi/graph'
    max_nodes = float('inf')  # size cap; infinity keeps every graph
    kept = []
    for path in glob(graph_dir + '/*.gexf'):
        graph = nx.read_gexf(path)
        gid = get_file_base_id(path)
        print(gid, graph.number_of_nodes())
        if graph.number_of_nodes() > max_nodes:
            continue
        graph.graph['gid'] = gid
        for node in graph.nodes():
            del graph.node[node]['node_class']
        for _, _, attrs in graph.edges_iter(data=True):
            del attrs['weight']
        kept.append(graph)
    print(len(kept))
    gen_dataset(kept)
def main():
    """Convert the GEXF graphs of ``conf.infolder`` into a dataset.

    For each graph file: load it, reduce a disconnected graph to its
    largest connected component (recording how many nodes were dropped),
    attach ``gid`` and ``label`` graph attributes, rename the
    ``node_class`` node attribute to ``type`` when ``conf.has_node_type``
    is set, and delete the ``weight`` edge attribute. The collected graphs
    are passed to ``gen_dataset``; the label map and the per-graph info
    map are written under ``conf.outfolder``.

    Uses the NetworkX 1.x API (``edges_iter``,
    ``connected_component_subgraphs``).
    """
    graph_dir = get_data_path() + '/{}/graph'.format(conf.infolder)
    max_nodes = float('inf')  # size cap; infinity keeps every graph
    kept = []
    glabel_map = read_graph_labels()
    info_map = {}
    disconnected = []
    paths = glob(graph_dir + '/*.gexf')
    if conf.need_sort_:
        paths = sorted_nicely(paths)
    for idx, path in enumerate(paths):
        graph = nx.read_gexf(path)
        gid = get_file_base_id(path)
        print(idx, gid, graph.number_of_nodes())
        if graph.number_of_nodes() > max_nodes:
            continue
        if nx.is_connected(graph):
            note = ''
        else:
            print(gid, 'is not connected')
            full_size = graph.number_of_nodes()
            # Keep only the largest connected component.
            graph = max(nx.connected_component_subgraphs(graph), key=len)
            removed = full_size - graph.number_of_nodes()
            assert (removed > 0)
            note = 'rm_{}_nodes'.format(removed)
            disconnected.append(graph)
        kept.append(graph)
        info_map[gid] = note
        graph.graph['gid'] = gid
        graph.graph['label'] = glabel_map[gid]
        for _, attrs in graph.nodes(data=True):
            node_class = attrs['node_class']
            if conf.has_node_type:
                # Rename the attribute: 'node_class' -> 'type'.
                attrs.pop('node_class')
                attrs['type'] = node_class
        for _, _, attrs in graph.edges_iter(data=True):
            del attrs['weight']
    print(len(kept))
    gen_dataset(kept)
    gen_dataset(disconnected)
    save_glabels_as_txt(
        get_data_path() + '/{}/glabels'.format(conf.outfolder), glabel_map)
    save_glabels_as_txt(
        get_data_path() + '/{}/info'.format(conf.outfolder), info_map)
def _load_emb(self, train):
    """Return the graph2vec embedding matrix for the train or test split.

    A cached ``.npy`` file under the result path is returned directly if
    it exists. Otherwise the matrix is assembled from the JSON embeddings
    given by ``self._load_json_emb()``: each entry whose graph id appears
    in ``self._gid_to_matrixid(data)`` is written to the row that mapping
    assigns, and the result is saved to the cache file before returning.

    Args:
        train: truthy for the 'train' split, falsy for the 'test' split.

    Returns:
        Array of shape ``(len(data.graphs), self.dim)``, or whatever the
        cached file holds when it is loaded from disk.

    Raises:
        RuntimeError: if the number of JSON entries matched to a graph id
            differs from the number of ids in the split's id map.
    """
    fn = get_result_path() + '/{}/emb/{}_graph2vec_{}_emb_dim_{}.npy'.format(
        self.dataset, self.dataset, 'train' if train else 'test', self.dim)
    if isfile(fn):
        emb = np.load(fn)
        print('Loaded emb {} from {}'.format(emb.shape, fn))
        return emb
    data = load_data(self.dataset, train=train)
    id_map = self._gid_to_matrixid(data)
    emb = np.zeros((len(data.graphs), self.dim))
    cnt = 0
    json_emb = self._load_json_emb()
    for f in json_emb:
        gid = get_file_base_id(f)
        if gid in id_map:
            emb[id_map[gid]] = json_emb[f]
            cnt += 1
    if cnt != len(id_map):
        # Format the message *before* building the exception; calling
        # .format() on the RuntimeError instance raised AttributeError.
        raise RuntimeError(
            'Mismatch: {} != {}'.format(cnt, len(id_map)))
    np.save(fn, emb)
    print('Saved emb {} to {}'.format(emb.shape, fn))
    return emb
def get_old_aids_id():
    """Return the base ids of all ``*.gxl`` files in the old AIDS data dir.

    The directory searched is ``<root path>/data/AIDS_old/data``; ids are
    returned in the order ``glob`` yields the files.
    """
    pattern = get_root_path() + '/data/AIDS_old/data/*.gxl'
    ids = []
    for path in glob(pattern):
        ids.append(get_file_base_id(path))
    return ids