class _MNA(object):
    """SVM-based network aligner (MNA, structural features only).

    Trains an ``svm.SVC`` classifier on pairwise structural features
    (common anchored neighbours, Jaccard similarity, Adamic/Adar measure)
    between nodes of a source graph ``graph['f']`` and a target graph
    ``graph['g']``, supervised by known anchor links.
    """

    def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file):
        """
        :param graph: dict with keys ``'f'`` (source) and ``'g'`` (target)
            graph objects exposing ``G``, ``look_up_dict``, ``look_back_list``.
        :param anchorfile: path to the anchor-link (ground-truth) file.
        :param valid_prop: proportion passed to ``load_train_valid_labels``.
        :param neg_ratio: number of negative samples per positive pair.
        :param log_file: basename of the log file under ``log/``.
        """
        # Start from a fresh log file on every run.
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)
        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return
        self.L = load_train_valid_labels(anchorfile, valid_prop)
        self.graph = graph
        # name -> index and index -> name maps for both graphs
        self.look_up = {'f': self.graph['f'].look_up_dict,
                        'g': self.graph['g'].look_up_dict}
        self.look_back = {'f': self.graph['f'].look_back_list,
                          'g': self.graph['g'].look_back_list}
        self.neg_ratio = neg_ratio
        self.batch_size = 1024
        self.clf = svm.SVC()

    def __get_pair_features(self, src_nds, target_nds):
        """Yield ``[common_neighbours, jaccard, adamic_adar]`` for each
        (src, target) node-name pair; pairs with unknown nodes are skipped."""
        pair_features = list()
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The size of sampling in processing __get_pair_features is not equal.'
            )
            # BUGFIX: stop here -- the original fell through and indexed
            # past the end of the shorter list.
            yield pair_features
            return
        for src_nd, target_nd in zip(src_nds, target_nds):
            if src_nd not in self.graph['f'].G or target_nd not in self.graph['g'].G:
                continue
            # neighbours of src_nd that are themselves anchored
            src_neighbor_anchors = set(
                nd for nd in self.graph['f'].G[src_nd]
                if nd in self.L['f2g']['train'])
            target_neighbor_anchors = set(
                nd for nd in self.graph['g'].G[target_nd]
                if nd in self.L['g2f']['train'])
            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for target_anchor_nd in self.L['f2g']['train'][sna]:
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        # Adamic/Adar: down-weight high-degree anchors.
                        AA_measure += 1. / np.log(
                            (len(self.graph['f'].G[sna])
                             + len(self.graph['g'].G[target_anchor_nd])) / 2.)
            jaccard = cnt_common_neighbors / (
                len(self.graph['f'].G[src_nd])
                + len(self.graph['g'].G[target_nd])
                - cnt_common_neighbors + 1e-6)
            yield [cnt_common_neighbors, jaccard, AA_measure]

    def __batch_iter(self, lbs, batch_size, neg_ratio, lookup_src, lookup_obj,
                     src_lb_tag, obj_lb_tag):
        """Generate ``(pos_src, pos_obj, neg_src, neg_obj)`` batches.

        ``neg_src``/``neg_obj`` are lists of per-positive-pair lists of
        length ``neg_ratio``.  ``lookup_src``/``lookup_obj`` are kept for
        interface compatibility but are unused here.
        """
        train_lb_src2obj = lbs['{}2{}'.format(src_lb_tag, obj_lb_tag)]['train']
        train_lb_obj2src = lbs['{}2{}'.format(obj_lb_tag, src_lb_tag)]['train']
        train_size = len(train_lb_src2obj)
        start_index = 0
        end_index = min(start_index + batch_size, train_size)
        # BUGFIX: materialise the key views so they are indexable on
        # Python 3 as well (dict.keys() is not subscriptable there).
        src_lb_keys = list(train_lb_src2obj.keys())
        obj_lb_keys = list(train_lb_obj2src.keys())
        shuffle_indices = np.random.permutation(np.arange(train_size))
        while start_index < end_index:
            pos_src = list()
            pos_obj = list()
            neg_src = list()
            neg_obj = list()
            for i in range(start_index, end_index):
                src_lb = src_lb_keys[shuffle_indices[i]]
                obj_lbs = train_lb_src2obj[src_lb]
                for obj_lb in obj_lbs:
                    cur_neg_src = list()
                    cur_neg_obj = list()
                    for _ in range(neg_ratio):
                        # Rejection-sample a label that is neither a true
                        # match nor already drawn for this pair.
                        rand_obj_lb = None
                        while (not rand_obj_lb or rand_obj_lb in cur_neg_obj
                               or rand_obj_lb in obj_lbs):
                            rand_obj_lb = obj_lb_keys[
                                random.randint(0, len(obj_lb_keys) - 1)]
                        cur_neg_src.append(src_lb)
                        cur_neg_obj.append(rand_obj_lb)
                    pos_src.append(src_lb)
                    pos_obj.append(obj_lb)
                    neg_src.append(cur_neg_src)
                    neg_obj.append(cur_neg_obj)
            start_index = end_index
            end_index = min(start_index + batch_size, train_size)
            yield pos_src, pos_obj, neg_src, neg_obj

    def train(self):
        """Build pairwise features for every batch and fit the SVM."""
        batches_f2g = list(self.__batch_iter(
            self.L, self.batch_size, self.neg_ratio,
            self.look_up['f'], self.look_up['g'], 'f', 'g'))
        X = list()
        Y = list()
        for pos_src_f2g, pos_obj_f2g, neg_src_f2g, neg_obj_f2g in batches_f2g:
            # BUGFIX: reject the batch when EITHER the positive or the
            # negative lists are misaligned (original used 'and', which
            # only skipped when both were misaligned).
            if (len(pos_src_f2g) != len(pos_obj_f2g)
                    or len(neg_src_f2g) != len(neg_obj_f2g)):
                self.logger.info(
                    'The input label file goes wrong as the file format.')
                continue
            pos_features = list(
                self.__get_pair_features(pos_src_f2g, pos_obj_f2g))
            X.extend(pos_features)
            Y.extend([1] * len(pos_features))
            # NOTE(review): neg_src_f2g[k] is the k-th PAIR's negative list,
            # not the k-th negative slot across all pairs -- confirm the
            # intended sampling layout.
            for k in range(self.neg_ratio):
                neg_features = list(
                    self.__get_pair_features(neg_src_f2g[k], neg_obj_f2g[k]))
                X.extend(neg_features)
                Y.extend([-1] * len(neg_features))
        self.logger.info('Training Model...')
        self.clf.fit(X, Y)
        self.logger.info('Complete Training process...')
def main(args):
    """Dispatch a network-alignment run according to ``args.method``.

    Supported methods: 'pale', 'mna', 'fruip', 'final', 'crossmna'.
    Models for 'mna', 'fruip' and 'pale' are saved to ``args.output``.
    """
    t1 = time.time()
    # Pin the GPU selection before any framework initialises CUDA.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    # args.use_net=False
    logger = LogHandler('RUN.' + time.strftime('%Y-%m-%d',
                                               time.localtime(time.time())))
    logger.info(args)
    SAVING_STEP = args.saving_step
    MAX_EPOCHS = args.epochs
    if args.method == 'pale':
        model = PALE(learning_rate=args.lr,
                     batch_size=args.batch_size,
                     n_input=args.input_size,
                     n_hidden=args.hidden_size,
                     n_layer=args.layers,
                     files=args.embeddings + args.identity_linkage,
                     type_model=args.type_model,
                     is_valid=args.is_valid,
                     log_file=args.log_file,
                     device=args.device)
        losses = np.zeros(MAX_EPOCHS)
        val_scrs = np.zeros(MAX_EPOCHS)
        best_scr = .0
        best_epoch = 0
        # Number of consecutive SAVING_STEP windows checked for early stop.
        thres = 100
        for i in range(1, MAX_EPOCHS + 1):
            losses[i - 1], val_scrs[i - 1] = model.train_one_epoch()
            if i > 0 and i % SAVING_STEP == 0:
                # Mean loss / validation score over the last window.
                loss_mean = np.mean(losses[i - SAVING_STEP:i])
                scr_mean = np.mean(val_scrs[i - SAVING_STEP:i])
                logger.info(
                    'loss in last {} epoches: {}, validation in last {} epoches: {}'
                    .format(SAVING_STEP, loss_mean, SAVING_STEP, scr_mean))
                if scr_mean > best_scr:
                    best_scr = scr_mean
                    best_epoch = i
                    model.save_models(args.output)
                if args.early_stop and i >= thres * SAVING_STEP:
                    # Stop when every one of the last `thres` windows scores
                    # below the best and the best epoch is old enough.
                    cnt = 0
                    for k in range(thres - 1, -1, -1):
                        cur_val = np.mean(
                            val_scrs[i - (k + 1) * SAVING_STEP:i - k * SAVING_STEP])
                        if cur_val < best_scr:
                            cnt += 1
                    if cnt == thres and (i - best_epoch) >= thres * SAVING_STEP:
                        logger.info('*********early stop*********')
                        logger.info(
                            'The best epoch: {}\nThe validation score: {}'.format(
                                best_epoch, best_scr))
                        break
    if args.method == 'mna' or args.method == 'fruip':
        # Both methods operate on an explicit pair of graphs ('f' and 'g').
        graph = defaultdict(Graph)
        print("Loading graph...")
        if len(args.graphs) != 2:
            logger.error('#####The input graphs must be pairwise!#####')
            sys.exit(1)
        if args.graph_format == 'adjlist':
            if args.graphs[0]:
                graph['f'].read_adjlist(filename=args.graphs[0])
            if args.graphs[1]:
                graph['g'].read_adjlist(filename=args.graphs[1])
        if args.graph_format == 'edgelist':
            if args.graphs[0]:
                graph['f'].read_edgelist(filename=args.graphs[0])
            if args.graphs[1]:
                graph['g'].read_edgelist(filename=args.graphs[1])
        if args.method == 'mna':
            model = MNA(graph=graph, attr_file=args.embeddings,
                        anchorfile=args.identity_linkage, valid_prop=1.,
                        use_net=args.use_net, neg_ratio=args.neg_ratio,
                        log_file=args.log_file)
        if args.method == 'fruip':
            model = FRUIP(graph=graph, embed_files=args.embeddings,
                          linkage_file=args.identity_linkage)
            model.main_proc(args.threshold)
    if args.method == 'final':
        main_proc(graph_files=args.graphs, graph_sizes=args.graph_sizes,
                  linkage_file=args.identity_linkage, alpha=args.alpha,
                  epoch=args.epochs, tol=args.tol,
                  graph_format=args.graph_format, output_file=args.output)
    if args.method == 'crossmna':
        num_graphs = len(args.graphs)
        layer_graphs = [Graph() for i in range(num_graphs)]
        for k in range(num_graphs):
            graph_path = args.graphs[k]
            format_graph_path = '{}.crossmna'.format(graph_path)
            # NOTE(review): presumably rewrites node ids with the layer
            # index k so layers do not collide -- confirm against
            # format_crossmna_graph.
            format_crossmna_graph(graph_path, format_graph_path, k)
            if args.graph_format == 'adjlist':
                layer_graphs[k].read_adjlist(filename=format_graph_path)
            if args.graph_format == 'edgelist':
                layer_graphs[k].read_edgelist(filename=format_graph_path)
        model = CROSSMNA(layer_graphs=layer_graphs,
                         anchor_file=args.identity_linkage, lr=args.lr,
                         batch_size=args.batch_size,
                         nd_rep_size=args.nd_rep_size,
                         layer_rep_size=args.layer_rep_size,
                         epoch=args.epochs, negative_ratio=args.neg_ratio,
                         table_size=args.table_size, outfile=args.output,
                         log_file=args.log_file)
    if args.method in ['mna', 'fruip', 'pale']:
        # NOTE(review): PALE saved via save_models() above but save_model()
        # is called here -- confirm both methods exist on the model classes.
        model.save_model(args.output)
    t2 = time.time()
    print('time cost:', t2 - t1)
class _MNA(object):
    """SVM-based network aligner (MNA) with optional node attributes.

    Combines structural pair features (common anchored neighbours, Jaccard,
    Adamic/Adar; enabled via ``use_net``) with per-node attribute features
    and fits an ``svm.SVC(probability=True)`` classifier.
    """

    def __init__(self, graph, attr_file, anchorfile, use_net, valid_prop,
                 neg_ratio, log_file):
        """
        :param graph: dict with keys ``'f'`` (source) and ``'g'`` (target)
            graph objects exposing ``G``, ``look_up_dict``, ``look_back_list``.
        :param attr_file: optional pair ``[src_attr_csv, target_attr_csv]``.
        :param anchorfile: path to the anchor-link (ground-truth) file.
        :param use_net: include structural features when truthy.
        :param valid_prop: proportion passed to ``load_train_valid_labels``.
        :param neg_ratio: number of negative samples per positive pair.
        :param log_file: basename of the log file under ``log/``.
        """
        # Start from a fresh log file on every run.
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)
        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return
        self.use_net = use_net
        self.graph = graph
        # name -> index and index -> name maps for both graphs
        self.lookup = {'f': self.graph['f'].look_up_dict,
                       'g': self.graph['g'].look_up_dict}
        self.look_back = {'f': self.graph['f'].look_back_list,
                          'g': self.graph['g'].look_back_list}
        self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop)
        self.attributes = dict()
        if attr_file:
            self.attributes['f'] = self._set_node_attributes(attr_file[0])
            self.attributes['g'] = self._set_node_attributes(attr_file[1])
        self.neg_ratio = neg_ratio
        self.batch_size = 1024
        self.clf = svm.SVC(probability=True)

    def _set_node_attributes(self, attr_file):
        """Read a CSV of ``node,attr1,attr2,...`` into node -> [float],
        or return None when no file is given."""
        if not attr_file:
            return None
        node_attributes = defaultdict(list)
        with open(attr_file, 'r') as fin:
            for ln in fin:
                elems = ln.strip().split(',')
                node_attributes[elems[0]] = list(map(float, elems[1:]))
        return node_attributes

    def _get_pair_features(self, src_nds, target_nds):
        """Yield the combined structural + attribute feature vector for each
        (src_index, target_index) pair of node indices."""
        pair_features = list()
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The size of sampling in processing _get_pair_features is not equal.'
            )
            # BUGFIX: stop here -- the original fell through and indexed
            # past the end of the shorter list.
            yield pair_features
            return
        for src_nd_idx, target_nd_idx in zip(src_nds, target_nds):
            # indices -> node names
            src_nd = self.look_back['f'][src_nd_idx]
            target_nd = self.look_back['g'][target_nd_idx]
            # neighbours of each node that are themselves anchored
            src_neighbor_anchors = set(
                nd for nd in self.graph['f'].G[src_nd]
                if nd in self.L['f2g']['train'])
            target_neighbor_anchors = set(
                nd for nd in self.graph['g'].G[target_nd]
                if nd in self.L['g2f']['train'])
            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for target_anchor_nd in self.L['f2g']['train'][sna]:
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        # Adamic/Adar: down-weight high-degree anchors.
                        AA_measure += 1. / np.log(
                            (len(self.graph['f'].G[sna])
                             + len(self.graph['g'].G[target_anchor_nd])) / 2.)
            jaccard = cnt_common_neighbors / (
                len(self.graph['f'].G[src_nd])
                + len(self.graph['g'].G[target_nd])
                - cnt_common_neighbors + 1e-6)
            feat_net = []
            feat_attr = []
            if self.use_net:
                feat_net = [cnt_common_neighbors, jaccard, AA_measure]
            if len(self.attributes) > 0:
                feat_len = len(self.attributes['f'][src_nd])
                # NOTE(review): 1 - f_attr + g_attr looks like it may have
                # been intended as 1 - |f_attr - g_attr| -- confirm.
                feat_attr = [1 - self.attributes['f'][src_nd][k]
                             + self.attributes['g'][target_nd][k]
                             for k in range(feat_len)]
            yield feat_net + feat_attr

    def train(self):
        """Build pairwise features for every batch and fit the SVM."""
        batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio,
                                 self.lookup, 'f', 'g')
        X = list()
        Y = list()
        for pos, neg in batches_f2g:
            # BUGFIX: reject the batch when EITHER the positive or the
            # negative lists are misaligned (original used 'and', which
            # only skipped when both were misaligned).
            if len(pos['f']) != len(pos['g']) or len(neg['f']) != len(neg['g']):
                self.logger.info(
                    'The input label file goes wrong as the file format.')
                continue
            pos_features = list(self._get_pair_features(pos['f'], pos['g']))
            X.extend(pos_features)
            Y.extend([1] * len(pos_features))
            for k in range(self.neg_ratio):
                neg_features = list(
                    self._get_pair_features(neg['f'][k], neg['g'][k]))
                X.extend(neg_features)
                Y.extend([-1] * len(neg_features))
        self.logger.info('Training Model...')
        print(len(X), len(X[0]), len(Y))
        self.clf.fit(X, Y)
        print(self.clf)
        self.logger.info('Training score: %f' % self.clf.score(X, Y))
        self.logger.info('Complete Training process...')
class _LINE_ANCHORREG_ALIGN_PRETRAIN(object): def __init__(self, graph, lr=.001, gamma=.1, rep_size=128, batch_size=100, negative_ratio=5\ , order=3, table_size=1e8, embedfile=None, anchorfile=None, log_file='log'): if not embedfile or not anchorfile: return if os.path.exists('log/' + log_file + '.log'): os.remove('log/' + log_file + '.log') self.logger = LogHandler(log_file) self.epsilon = 1e-7 self.table_size = table_size self.sigmoid_table = {} self.sigmoid_table_size = 1000 self.SIGMOID_BOUND = 6 self._init_simgoid_table() self.g = graph self.look_up = self.g.look_up_dict self.idx = defaultdict(int) self.update_dict = defaultdict(dict) self.update_look_back = defaultdict(list) self.node_size = graph.G.number_of_nodes() self.rep_size = rep_size self.order = order self.lr = lr self.gamma = gamma self.cur_epoch = 0 self.batch_size = batch_size self.negative_ratio = negative_ratio self._gen_sampling_table() self._init_params(self.node_size, rep_size, embedfile, anchorfile) def _init_params(self, node_size, rep_size, embedfile, anchorfile): self.embeddings = dict() self.embeddings['order1'] = np.random.normal(0, 1, (node_size, rep_size)) self.embeddings['order2'] = np.random.normal(0, 1, (node_size, rep_size)) self.embeddings['content'] = np.random.normal(0, 1, (node_size, rep_size)) self.embeddings['order1'] = self._set_anchor_nds( self.embeddings['order1'], embedfile, anchorfile, 1) self.embeddings['order2'] = self._set_anchor_nds( self.embeddings['order2'], embedfile, anchorfile, 2) self._init_update_params(node_size, rep_size) self._pre_train() def _init_update_params(self, node_size, rep_size): # adagrad self.h_delta = dict() self.h_delta['order1'] = np.zeros((node_size, rep_size)) self.h_delta['order2'] = np.zeros((node_size, rep_size)) self.h_delta['content'] = np.zeros((node_size, rep_size)) # adam self.m = dict() self.m['order1'] = np.zeros((node_size, rep_size)) self.m['order2'] = np.zeros((node_size, rep_size)) self.m['content'] = np.zeros((node_size, 
rep_size)) self.v = dict() self.v['order1'] = np.zeros((node_size, rep_size)) self.v['order2'] = np.zeros((node_size, rep_size)) self.v['content'] = np.zeros((node_size, rep_size)) self.t = 1 def _read_anchors(self, anchorfile): anchors = list() with open(anchorfile, 'r') as anchor_handler: for ln in anchor_handler: elems = ln.strip().split() anchors.append((elems[0], elems[1])) return anchors def _read_embeddings(self, embedfile): embeddings = dict() with open(embedfile, 'r') as embed_handler: for ln in embed_handler: elems = ln.strip().split() if len(elems) <= 2: continue embeddings[elems[0]] = map(float, elems[1:]) return embeddings def _set_anchor_nds(self, mat, embedfile, anchorfile, order): self.anchors = self._read_anchors(anchorfile) self.src_embeddings = self._read_embeddings(embedfile) self.anchor_idx = set() for src_nd, target_nd in self.anchors: if not target_nd in self.look_up or not src_nd in self.src_embeddings: continue if len(mat[self.look_up[target_nd]]) != len( self.src_embeddings[src_nd]): self.logger.error( 'The length of embeddings at anchor nodes are illegal') break self.anchor_idx.add(self.look_up[target_nd]) # if order==1: # mat[self.look_up[target_nd]] = self.src_embeddings[src_nd][0:len(mat[self.look_up[target_nd]])] # if order==2: # mat[self.look_up[target_nd]] = self.src_embeddings[src_nd][len(mat[self.look_up[target_nd]]):] mat[self.look_up[target_nd]] = self.src_embeddings[src_nd] return mat def _pre_train(self): self.logger.info("Pretraining...") DISPLAY_EPOCH = 1000 order = self.order batches = self.batch_iter() opt_type = 'adam' for batch in batches: self.idx = defaultdict(int) self.update_look_back = defaultdict(list) self.update_dict = defaultdict(dict) if order == 1 or order == 3: delta_eh_o1 = self._pretrain_update_graph_by_order1(batch) len_delta = len(delta_eh_o1) # print 'order1 nd' if opt_type == 'adagrad': self.h_delta['order1'], self.embeddings['order1'] = \ self.update_vec('nd_order1', self.h_delta['order1'], delta_eh_o1 
, self.embeddings['order1'], len_delta, self.t) if opt_type == 'adam': self.m['order1'], self.v['order1'], self.embeddings['order1'] = \ self.update_vec_by_adam('nd_order1', self.m['order1'], self.v['order1'], delta_eh_o1 , self.embeddings['order1'], len_delta, self.t) if order == 2 or order == 3: delta_c, delta_eh_o2 = self._pretrain_update_graph_by_order2( batch) len_delta = len(delta_eh_o2) # print 'order2, nd' if opt_type == 'adagrad': self.h_delta['order2'], self.embeddings['order2'] = \ self.update_vec('nd_order2', self.h_delta['order2'], delta_eh_o2 , self.embeddings['order2'], len_delta, self.t) if opt_type == 'adam': self.m['order2'], self.v['order2'], self.embeddings['order2'] = \ self.update_vec_by_adam('nd_order2', self.m['order2'], self.v['order2'], delta_eh_o2 , self.embeddings['order2'], len_delta, self.t) len_content = len(delta_c) # print 'order2, content' if opt_type == 'adagrad': self.h_delta_c, self.embeddings['content'] = \ self.update_vec('cnt_order2', self.h_delta['content'], delta_c , self.embeddings['content'], len_content, self.t) if opt_type == 'adam': self.m['content'], self.v['content'], self.embeddings['content'] = \ self.update_vec_by_adam('cnt_order2', self.m['content'], self.v['content'], delta_c , self.embeddings['content'], len_content, self.t) # self.embeddings_order2[self.update_look_back[:len_de],:] -= self.lr*delta_eh # len_content = len(delta_c) # self.content_embeddings[self.update_look_back[:len_content],:] -= self.lr*delta_c # break if (self.t - 1) % DISPLAY_EPOCH == 0: self.get_cur_batch_loss(self.t, batch) self.t += 1 self._init_update_params(self.node_size, self.rep_size) self.logger.info("End of Pretraining") def _init_simgoid_table(self): for k in range(self.sigmoid_table_size): x = 2 * self.SIGMOID_BOUND * k / self.sigmoid_table_size - self.SIGMOID_BOUND self.sigmoid_table[k] = 1. 
/ (1 + np.exp(-x)) def _fast_sigmoid(self, val): if val > self.SIGMOID_BOUND: return 1 - self.epsilon elif val < -self.SIGMOID_BOUND: return self.epsilon k = int((val + self.SIGMOID_BOUND) * self.sigmoid_table_size / self.SIGMOID_BOUND / 2) return self.sigmoid_table[k] # return 1./(1+np.exp(-val)) def _pretrain_update_graph_by_order2(self, batch): ''' x = self._binarize(self.embeddings[key]) ''' pos_h, pos_t, pos_h_v, neg_t = batch batch_size = len(pos_h) # print pos_h, pos_t, pos_h_v, neg_t # order 2 pos_u = self.embeddings['order2'][pos_h, :] pos_v_c = self.embeddings['content'][pos_t, :] neg_u = self.embeddings['order2'][pos_h_v, :] neg_v_c = self.embeddings['content'][neg_t, :] pos_e = np.sum(pos_u * pos_v_c, axis=1) # pos_e.shape = batch_size neg_e = np.sum(neg_u * neg_v_c, axis=2) # neg_e.shape = batch_size*negative_ratio sigmoid_pos_e = np.array([ self._fast_sigmoid(val) for val in pos_e.reshape(-1) ]).reshape(pos_e.shape) sigmoid_neg_e = np.array([ self._fast_sigmoid(val) for val in neg_e.reshape(-1) ]).reshape(neg_e.shape) # temporal delta delta_eh = list() delta_c = list() for i in range(len(pos_t)): u, v = pos_h[i], pos_t[i] if not v in self.anchor_idx: delta_c = self._calc_delta_vec('cnt_order2', v, delta_c, (sigmoid_pos_e[i] - 1) * pos_u[i, :]) if not u in self.anchor_idx: delta_eh = self._calc_delta_vec('nd_order2', u, delta_eh, (sigmoid_pos_e[i] - 1) * pos_v_c[i, :]) # print 'delta_eh',delta_eh,ndDict_order neg_shape = neg_e.shape for i in range(neg_shape[0]): for j in range(neg_shape[1]): u, v = pos_h_v[i][j], neg_t[i][j] if not v in self.anchor_idx: delta_c = self._calc_delta_vec( 'cnt_order2', v, delta_c, sigmoid_neg_e[i, j] * neg_u[i, j, :]) if not u in self.anchor_idx: delta_eh = self._calc_delta_vec( 'nd_order2', u, delta_eh, sigmoid_neg_e[i, j] * neg_v_c[i, j, :]) # print sigmoid_neg_e[i,j]*neg_v_c[i,j,:], type(sigmoid_neg_e[i,j]*neg_v_c[i,j,:]) # print 'delta_eh',delta_eh,ndDict_order # delta x & delta codebook delta_eh = 
self._format_vec('nd_order2', delta_eh) delta_c = self._format_vec('cnt_order2', delta_c) return delta_c / batch_size, delta_eh / batch_size def _pretrain_update_graph_by_order1(self, batch): ''' x = self._binarize(self.embeddings[key]) ''' pos_h, pos_t, pos_h_v, neg_t = batch batch_size = len(pos_h) # order 1 pos_u = self.embeddings['order1'][pos_h, :] pos_v = self.embeddings['order1'][pos_t, :] neg_u = self.embeddings['order1'][pos_h_v, :] neg_v = self.embeddings['order1'][neg_t, :] pos_e = np.sum(pos_u * pos_v, axis=1) # pos_e.shape = batch_size neg_e = np.sum(neg_u * neg_v, axis=2) # neg_e.shape = batch_size*negative_ratio sigmoid_pos_e = np.array([ self._fast_sigmoid(val) for val in pos_e.reshape(-1) ]).reshape(pos_e.shape) sigmoid_neg_e = np.array([ self._fast_sigmoid(val) for val in neg_e.reshape(-1) ]).reshape(neg_e.shape) # delta calculation delta_eh = list() for i in range(len(pos_t)): u, v = pos_h[i], pos_t[i] if not v in self.anchor_idx: delta_eh = self._calc_delta_vec('nd_order1', v, delta_eh, (sigmoid_pos_e[i] - 1) * pos_u[i, :]) if not u in self.anchor_idx: delta_eh = self._calc_delta_vec('nd_order1', u, delta_eh, (sigmoid_pos_e[i] - 1) * pos_v[i, :]) neg_shape = neg_e.shape for i in range(neg_shape[0]): for j in range(neg_shape[1]): u, v = pos_h_v[i][j], neg_t[i][j] if not v in self.anchor_idx: delta_eh = self._calc_delta_vec( 'nd_order1', v, delta_eh, sigmoid_neg_e[i, j] * neg_u[i, j, :]) if not u in self.anchor_idx: delta_eh = self._calc_delta_vec( 'nd_order1', u, delta_eh, sigmoid_neg_e[i, j] * neg_v[i, j, :]) # delta x & delta codebook delta_eh = self._format_vec('nd_order1', delta_eh) return delta_eh / batch_size def _cos_sim(self, vec1, vec2): return np.dot(vec1, vec2) / np.linalg.norm(vec1) / np.linalg.norm(vec2) def _update_graph_by_anchor_reg(self): delta_eh = list() cnt = 0 for src_nd, target_nd in self.anchors: if not src_nd in self.src_embeddings or not target_nd in self.look_up: continue src_emb = np.array(self.src_embeddings[src_nd]) 
if self.order == 2: target_emb = self.embeddings['order2'][self.look_up[target_nd]] if self.order == 1: target_emb = self.embeddings['order1'][self.look_up[target_nd]] delta_eh = self._calc_delta_vec( 'nd_order2', self.look_up[target_nd], delta_eh, (self._cos_sim(src_emb, target_emb) * target_emb / np.dot(target_emb, target_emb) - src_emb / np.linalg.norm(src_emb) / np.linalg.norm(target_emb)) / self._cos_sim(src_emb, target_emb)) cnt += 1 if self.order == 2: delta_eh = self._format_vec('nd_order2', delta_eh) if self.order == 1: delta_eh = self._format_vec('nd_order1', delta_eh) return delta_eh / cnt def _format_vec(self, cal_type, vec): len_gap = self.idx[cal_type] - len(vec) if len_gap > 0: for i in range(len_gap): if isinstance(vec, list): vec.append(np.zeros(vec[0].shape)) else: vec = np.vstack((vec, np.zeros(vec[0].shape))) return np.array(vec) def _calc_delta_vec(self, cal_type, nd, delta, opt_vec): if nd not in self.update_dict[cal_type]: cur_idx = self.idx[cal_type] self.update_dict[cal_type][nd] = cur_idx self.update_look_back[cal_type].append(nd) self.idx[cal_type] += 1 else: cur_idx = self.update_dict[cal_type][nd] if cur_idx >= len(delta): for i in range(cur_idx - len(delta)): delta.append(np.zeros(opt_vec.shape)) delta.append(opt_vec) else: delta[cur_idx] += opt_vec return delta def _update_graph_by_order2(self, batch): ''' x = self._binarize(self.embeddings[key]) ''' pos_h, pos_t, pos_h_v, neg_t = batch batch_size = len(pos_h) # print pos_h, pos_t, pos_h_v, neg_t # order 2 pos_u = self.embeddings['order2'][pos_h, :] pos_v_c = self.embeddings['content'][pos_t, :] neg_u = self.embeddings['order2'][pos_h_v, :] neg_v_c = self.embeddings['content'][neg_t, :] pos_e = np.sum(pos_u * pos_v_c, axis=1) # pos_e.shape = batch_size neg_e = np.sum(neg_u * neg_v_c, axis=2) # neg_e.shape = batch_size*negative_ratio sigmoid_pos_e = np.array([ self._fast_sigmoid(val) for val in pos_e.reshape(-1) ]).reshape(pos_e.shape) sigmoid_neg_e = np.array([ self._fast_sigmoid(val) 
for val in neg_e.reshape(-1) ]).reshape(neg_e.shape) # temporal delta delta_eh = list() delta_c = list() for i in range(len(pos_t)): u, v = pos_h[i], pos_t[i] delta_c = self._calc_delta_vec( 'cnt_order2', v, delta_c, (sigmoid_pos_e[i] - 1) * pos_u[i, :]) delta_eh = self._calc_delta_vec('nd_order2', u, delta_eh, (sigmoid_pos_e[i] - 1) * pos_v_c[i, :]) # print 'delta_eh',delta_eh,ndDict_order neg_shape = neg_e.shape for i in range(neg_shape[0]): for j in range(neg_shape[1]): u, v = pos_h_v[i][j], neg_t[i][j] delta_c = self._calc_delta_vec( 'cnt_order2', v, delta_c, sigmoid_neg_e[i, j] * neg_u[i, j, :]) delta_eh = self._calc_delta_vec( 'nd_order2', u, delta_eh, sigmoid_neg_e[i, j] * neg_v_c[i, j, :]) # print sigmoid_neg_e[i,j]*neg_v_c[i,j,:], type(sigmoid_neg_e[i,j]*neg_v_c[i,j,:]) # print 'delta_eh',delta_eh,ndDict_order # delta x & delta codebook delta_eh = self._format_vec('nd_order2', delta_eh) delta_c = self._format_vec('cnt_order2', delta_c) return delta_c / batch_size, delta_eh / batch_size def _update_graph_by_order1(self, batch): ''' x = self._binarize(self.embeddings[key]) ''' pos_h, pos_t, pos_h_v, neg_t = batch batch_size = len(pos_h) # order 1 pos_u = self.embeddings['order1'][pos_h, :] pos_v = self.embeddings['order1'][pos_t, :] neg_u = self.embeddings['order1'][pos_h_v, :] neg_v = self.embeddings['order1'][neg_t, :] pos_e = np.sum(pos_u * pos_v, axis=1) # pos_e.shape = batch_size neg_e = np.sum(neg_u * neg_v, axis=2) # neg_e.shape = batch_size*negative_ratio sigmoid_pos_e = np.array([ self._fast_sigmoid(val) for val in pos_e.reshape(-1) ]).reshape(pos_e.shape) sigmoid_neg_e = np.array([ self._fast_sigmoid(val) for val in neg_e.reshape(-1) ]).reshape(neg_e.shape) # delta calculation delta_eh = list() for i in range(len(pos_t)): u, v = pos_h[i], pos_t[i] delta_eh = self._calc_delta_vec( 'nd_order1', v, delta_eh, (sigmoid_pos_e[i] - 1) * pos_u[i, :]) delta_eh = self._calc_delta_vec( 'nd_order1', u, delta_eh, (sigmoid_pos_e[i] - 1) * pos_v[i, :]) neg_shape 
= neg_e.shape for i in range(neg_shape[0]): for j in range(neg_shape[1]): u, v = pos_h_v[i][j], neg_t[i][j] delta_eh = self._calc_delta_vec( 'nd_order1', v, delta_eh, sigmoid_neg_e[i, j] * neg_u[i, j, :]) delta_eh = self._calc_delta_vec( 'nd_order1', u, delta_eh, sigmoid_neg_e[i, j] * neg_v[i, j, :]) # delta x & delta codebook delta_eh = self._format_vec('nd_order1', delta_eh) return delta_eh / batch_size def _mat_add(self, mat1, mat2): # print '****mat add****' # print mat1, mat2 len_gap = len(mat1) - len(mat2) # print len_gap if len_gap > 0: for i in range(len_gap): mat2 = np.vstack((mat2, np.zeros(mat2[0, :].shape))) # print mat2 else: for i in range(-len_gap): mat1 = np.vstack((mat1, np.zeros(mat1[0, :].shape))) # print mat1 # print len(mat1), len(mat2) return mat1 + mat2 def get_anchor_reg_loss(self): cos_sim_list = list() for src_nd, target_nd in self.anchors: if not src_nd in self.src_embeddings or not target_nd in self.look_up: continue src_emb = np.array(self.src_embeddings[src_nd]) target_emb = self.embeddings['order2'][self.look_up[target_nd]] cos_sim_list.append(self._cos_sim(src_emb, target_emb)) return -np.mean(cos_sim_list) def get_graph_loss_by_order2(self, batch): pos_h, pos_t, pos_h_v, neg_t = batch # order 2 pos_u = self.embeddings['order2'][pos_h, :] pos_v_c = self.embeddings['content'][pos_t, :] neg_u = self.embeddings['order2'][pos_h_v, :] neg_v_c = self.embeddings['content'][neg_t, :] pos_e = np.sum(pos_u * pos_v_c, axis=1) # pos_e.shape = batch_size neg_e = np.sum(neg_u * neg_v_c, axis=2) # neg_e.shape = batch_size*negative_ratio sigmoid_pos_e = np.array([ self._fast_sigmoid(val) for val in pos_e.reshape(-1) ]).reshape(pos_e.shape) sigmoid_neg_e = np.array([ self._fast_sigmoid(val) for val in neg_e.reshape(-1) ]).reshape(neg_e.shape) return -np.mean( np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1)) def get_graph_loss_by_order1(self, batch): pos_h, pos_t, pos_h_v, neg_t = batch # order 2 pos_u = 
self.embeddings['order1'][pos_h, :] pos_v = self.embeddings['order1'][pos_t, :] neg_u = self.embeddings['order1'][pos_h_v, :] neg_v = self.embeddings['order1'][neg_t, :] # pos_e_1 = np.sum(pos_u*pos_v, axis=1)+np.sum(self.b_e[key][0][pos_t,:], axis=1) # pos_e.shape = batch_size # neg_e_1 = np.sum(neg_u*neg_v, axis=2)+np.sum(self.b_e[key][0][neg_t,:], axis=2) # neg_e.shape = batch_size*negative_ratio pos_e = np.sum(pos_u * pos_v, axis=1) # pos_e.shape = batch_size neg_e = np.sum(neg_u * neg_v, axis=2) # neg_e.shape = batch_size*negative_ratio sigmoid_pos_e = np.array([ self._fast_sigmoid(val) for val in pos_e.reshape(-1) ]).reshape(pos_e.shape) sigmoid_neg_e = np.array([ self._fast_sigmoid(val) for val in neg_e.reshape(-1) ]).reshape(neg_e.shape) return -np.mean( np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1)) def get_cur_batch_loss(self, t, batch): DISPLAY_EPOCH = 1 if t % DISPLAY_EPOCH == 0: loss_order_1 = 0.0 loss_order_2 = 0.0 if self.order == 1 or self.order == 3: loss_order_1 += self.get_graph_loss_by_order1(batch) if self.order == 2 or self.order == 3: anchor_loss = self.get_anchor_reg_loss() loss_order_2 += self.get_graph_loss_by_order2( batch) + anchor_loss if self.order == 1: self.logger.info( 'Finish processing batch {} and loss from order 1:{}'. format(t, loss_order_1)) elif self.order == 2: self.logger.info( 'Finish processing batch {} and loss from order 2:{} and anchor loss:{}' .format(t, loss_order_2, anchor_loss)) elif self.order == 3: self.logger.info( 'Finish processing batch {} and loss from order 3:{}'. 
    # NOTE(review): the next line is the tail of a method whose beginning is
    # truncated in this view (it closes a logging/format call on t and the
    # two loss terms); left byte-identical.
        format(t, loss_order_1 + loss_order_2))

    def update_vec(self, cal_type, h_delta, delta, embeddings, len_delta, t):
        """Apply one AdaGrad-style update to the rows touched in this batch.

        Accumulates the squared gradient ``delta**2`` into ``h_delta`` and
        steps the corresponding embedding rows by lr/sqrt(h_delta) * delta.
        The rows are selected via ``self.update_look_back[cal_type]`` (the
        batch-local -> global row mapping built during gradient computation).
        ``t`` is accepted for signature parity with the Adam variant but is
        not used here. Returns the updated (h_delta, embeddings).
        """
        h_delta[self.update_look_back[cal_type][:len_delta], :] += delta**2
        # print 'original embedding:',embeddings[self.update_look_back[cal_type][:len_delta]]
        embeddings[self.update_look_back[cal_type][:len_delta],:] -= \
            self.lr/np.sqrt(h_delta[self.update_look_back[cal_type][:len_delta],:])*delta
        # print 'delta:',delta
        # print 'h_delta:',h_delta[self.update_look_back[cal_type][:len_delta]]
        # print 'embeddings:',embeddings[self.update_look_back[cal_type][:len_delta]]
        # print 'lmd_rda:',elem_lbd
        return h_delta, embeddings

    def update_vec_by_adam(self, cal_type, m, v, delta, embeddings, len_delta, t):
        """Apply one Adam update to the rows touched in this batch.

        ``m``/``v`` are the first/second-moment accumulators; ``t`` is the
        global step used for bias correction (m_hat = m/(1-beta1^t), likewise
        for v). beta1/beta2 are (re)assigned on self on every call with the
        standard Adam defaults. Returns the updated (m, v, embeddings).
        """
        self.beta1 = .9
        self.beta2 = .999
        m[self.update_look_back[cal_type][:len_delta],:] = \
            self.beta1*m[self.update_look_back[cal_type][:len_delta],:]+(1-self.beta1)*delta
        v[self.update_look_back[cal_type][:len_delta],:] = \
            self.beta2*v[self.update_look_back[cal_type][:len_delta],:]+(1-self.beta2)*(delta**2)
        # Bias-corrected moment estimates.
        m_ = m[self.update_look_back[cal_type][:len_delta], :] / (
            1 - self.beta1**t)
        v_ = v[self.update_look_back[cal_type][:len_delta], :] / (
            1 - self.beta2**t)
        # self.epsilon guards the division; assumed set in __init__ (not
        # visible in this chunk).
        embeddings[
            self.update_look_back[cal_type][:len_delta], :] -= self.lr * m_ / (
                np.sqrt(v_) + self.epsilon)
        return m, v, embeddings

    def train_one_epoch(self):
        """Run one pass over the edge batches, updating the embeddings.

        ``self.order`` selects which objectives run: 1 = first-order only,
        2 = second-order (+content, + anchor regularization weighted by
        ``self.gamma``), 3 = both. ``opt_type`` is hard-wired to 'adam'
        (the 'adagrad' branches are kept but inactive). Loss is logged every
        DISPLAY_EPOCH steps of the global counter ``self.t``.
        """
        DISPLAY_EPOCH = 1000
        order = self.order
        batches = self.batch_iter()
        opt_type = 'adam'
        for batch in batches:
            # Per-batch scratch state rebuilt before computing gradients.
            self.idx = defaultdict(int)
            self.update_look_back = defaultdict(list)
            self.update_dict = defaultdict(dict)
            if order == 1 or order == 3:
                delta_eh_o1 = self._update_graph_by_order1(batch)
                len_delta = len(delta_eh_o1)
                # print 'order1 nd'
                if opt_type == 'adagrad':
                    self.h_delta['order1'], self.embeddings['order1'] = \
                        self.update_vec('nd_order1', self.h_delta['order1'], delta_eh_o1
                            , self.embeddings['order1'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order1'], self.v['order1'], self.embeddings['order1'] = \
                        self.update_vec_by_adam('nd_order1',
                            self.m['order1'], self.v['order1'], delta_eh_o1
                            , self.embeddings['order1'], len_delta, self.t)
            if order == 2 or order == 3:
                delta_c, delta_eh_o2 = self._update_graph_by_order2(batch)
                delta_eh_anchor_reg = self._update_graph_by_anchor_reg()
                delta_eh_o2 = self._format_vec('nd_order2', delta_eh_o2)
                len_delta = len(delta_eh_o2)
                # print 'order2, nd'
                # Anchor-regularization gradient is folded into the order-2
                # node gradient with weight gamma.
                if opt_type == 'adagrad':
                    # NOTE(review): this assigns self.h_delta_c while reading
                    # self.h_delta['content'] below in the content branch —
                    # looks like it should be self.h_delta['order2']-style
                    # consistency; confirm before enabling adagrad.
                    self.h_delta['order2'], self.embeddings['order2'] = \
                        self.update_vec('nd_order2', self.h_delta['order2']
                            , delta_eh_o2+self.gamma*delta_eh_anchor_reg
                            , self.embeddings['order2'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order2'], self.v['order2'], self.embeddings['order2'] = \
                        self.update_vec_by_adam('nd_order2', self.m['order2'], self.v['order2']
                            , delta_eh_o2+self.gamma*delta_eh_anchor_reg
                            , self.embeddings['order2'], len_delta, self.t)
                len_content = len(delta_c)
                # print 'order2, content'
                if opt_type == 'adagrad':
                    # NOTE(review): result lands in self.h_delta_c, not
                    # self.h_delta['content'] which is what gets read —
                    # probable dead/buggy branch while opt_type=='adam'.
                    self.h_delta_c, self.embeddings['content'] = \
                        self.update_vec('cnt_order2', self.h_delta['content'], delta_c
                            , self.embeddings['content'], len_content, self.t)
                if opt_type == 'adam':
                    self.m['content'], self.v['content'], self.embeddings['content'] = \
                        self.update_vec_by_adam('cnt_order2',
                            self.m['content'], self.v['content'], delta_c
                            , self.embeddings['content'], len_content, self.t)
            # self.embeddings_order2[self.update_look_back[:len_de],:] -= self.lr*delta_eh
            # len_content = len(delta_c)
            # self.content_embeddings[self.update_look_back[:len_content],:] -= self.lr*delta_c
            # break
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self.cur_epoch += 1

    def get_random_node_pairs(self, i, shuffle_indices, edges, edge_set, numNodes):
        """Return one positive edge plus negative_ratio sampled negatives.

        Returns (cur_h, cur_t, cur_h_v, cur_neg_t) where cur_h_v repeats the
        head node once per negative tail in cur_neg_t. Negative tails are
        drawn from self.sampling_table, rejecting real edges (membership in
        edge_set via head*numNodes+tail encoding), self-pairs and duplicates.
        """
        # balance the appearance of edges according to edge_prob
        # (alias-method redirect: with prob 1-edge_prob use the alias edge).
        if not random.random() < self.edge_prob[shuffle_indices[i]]:
            shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]
        cur_h = edges[shuffle_indices[i]][0]
        head = cur_h * numNodes
        cur_t = edges[shuffle_indices[i]][1]
        cur_h_v = []
        cur_neg_t = []
        for j in range(self.negative_ratio):
            rn = self.sampling_table[random.randint(0, self.table_size - 1)]
            while head + rn in edge_set or cur_h == rn or rn in cur_neg_t:
                rn = self.sampling_table[random.randint(
                    0, self.table_size - 1)]
                # NOTE(review): idx is never used — likely leftover code.
                idx = random.randint(0, self.table_size - 1)
            cur_h_v.append(cur_h)
            cur_neg_t.append(rn)
        return cur_h, cur_t, cur_h_v, cur_neg_t

    def batch_iter(self):
        """Yield training batches over the shuffled edge list.

        Each yielded value is a tuple (pos_h, pos_t, pos_h_v, neg_t) of
        parallel lists of length <= self.batch_size (see
        get_random_node_pairs for per-edge contents).
        """
        numNodes = self.node_size
        # Edge endpoints translated to integer node indices.
        edges = [(self.look_up[x[0]], self.look_up[x[1]])
                 for x in self.g.G.edges()]
        data_size = self.g.G.number_of_edges()
        # Encode (h, t) as h*numNodes+t for O(1) membership tests.
        edge_set = set([x[0] * numNodes + x[1] for x in edges])
        shuffle_indices = np.random.permutation(np.arange(data_size))
        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while start_index < data_size:
            # NOTE(review): this dict is dead — ret is rebound to a tuple
            # below before being yielded.
            ret = {}
            pos_h = []
            pos_t = []
            pos_h_v = []
            neg_t = []
            for i in range(start_index, end_index):
                cur_h, cur_t, cur_h_v, cur_neg_t = self.get_random_node_pairs(
                    i, shuffle_indices, edges, edge_set, numNodes)
                pos_h.append(cur_h)
                pos_t.append(cur_t)
                pos_h_v.append(cur_h_v)
                neg_t.append(cur_neg_t)
            ret = (pos_h, pos_t, pos_h_v, neg_t)
            start_index = end_index
            end_index = min(start_index + self.batch_size, data_size)
            yield ret

    def _gen_sampling_table(self):
        """Build the negative-sampling table and the edge alias table.

        Part 1: a node sampling table of size self.table_size where each
        node occupies a share proportional to degree**0.75 (the word2vec
        unigram smoothing). Part 2: edge_prob/edge_alias implementing
        Walker's alias method over weight-normalized edge probabilities,
        so edges can later be drawn proportionally to their weight in O(1).
        """
        table_size = self.table_size
        power = 0.75
        numNodes = self.node_size
        print "Pre-procesing for non-uniform negative sampling!"
        self.node_degree = np.zeros(numNodes)  # weighted out-degree per node
        look_up = self.g.look_up_dict
        for edge in self.g.G.edges():
            self.node_degree[look_up[edge[0]]] += self.g.G[edge[0]][
                edge[1]]["weight"]
        norm = sum(
            [math.pow(self.node_degree[i], power) for i in range(numNodes)])
        self.sampling_table = np.zeros(int(table_size), dtype=np.uint32)
        # Fill the table: node j covers the slice of cumulative probability
        # mass that belongs to it.
        p = 0
        i = 0
        for j in range(numNodes):
            p += float(math.pow(self.node_degree[j], power)) / norm
            while i < table_size and float(i) / table_size < p:
                self.sampling_table[i] = j
                i += 1
        data_size = self.g.G.number_of_edges()
        self.edge_alias = np.zeros(data_size, dtype=np.int32)
        self.edge_prob = np.zeros(data_size, dtype=np.float32)
        large_block = np.zeros(data_size, dtype=np.int32)
        small_block = np.zeros(data_size, dtype=np.int32)
        total_sum = sum([
            self.g.G[edge[0]][edge[1]]["weight"] for edge in self.g.G.edges()
        ])
        # Probabilities scaled so the mean is 1 (alias-method precondition).
        norm_prob = [
            self.g.G[edge[0]][edge[1]]["weight"] * data_size / total_sum
            for edge in self.g.G.edges()
        ]
        num_small_block = 0
        num_large_block = 0
        cur_small_block = 0
        cur_large_block = 0
        # Split edges into under-full (<1) and over-full (>=1) buckets.
        for k in range(data_size - 1, -1, -1):
            if norm_prob[k] < 1:
                small_block[num_small_block] = k
                num_small_block += 1
            else:
                large_block[num_large_block] = k
                num_large_block += 1
        # Pair each under-full bucket with an over-full donor.
        while num_small_block and num_large_block:
            num_small_block -= 1
            cur_small_block = small_block[num_small_block]
            num_large_block -= 1
            cur_large_block = large_block[num_large_block]
            self.edge_prob[cur_small_block] = norm_prob[cur_small_block]
            self.edge_alias[cur_small_block] = cur_large_block
            norm_prob[cur_large_block] = norm_prob[
                cur_large_block] + norm_prob[cur_small_block] - 1
            if norm_prob[cur_large_block] < 1:
                small_block[num_small_block] = cur_large_block
                num_small_block += 1
            else:
                large_block[num_large_block] = cur_large_block
                num_large_block += 1
        # Any leftovers are exactly full.
        while num_large_block:
            num_large_block -= 1
            self.edge_prob[large_block[num_large_block]] = 1
        while num_small_block:
            num_small_block -= 1
            self.edge_prob[small_block[num_small_block]] = 1

    def save_embeddings(self, outfile):
        """Write embeddings to word2vec-style text files.

        One '<outfile>.<key>' file per node/content embedding table, each
        starting with a '<count> <dim>' header line. When order == 3 an
        additional file concatenates the order-1 and order-2 node vectors.
        """
        vectors = self.get_vectors()
        for c in vectors.keys():
            if 'node_embeddings' in c or 'content_embeddings' in c:
                # outfile-[node_embeddings/content-embeddings]-[src/obj]
                fout = open('{}.{}'.format(outfile, c), 'w')
                node_num = len(vectors[c].keys())
                fout.write("{} {}\n".format(node_num, self.rep_size))
                for node, vec in vectors[c].items():
                    fout.write("{} {}\n".format(
                        node, ' '.join([str(x) for x in vec])))
                fout.close()
        if self.order == 3:
            fout = open('{}.node_embedding_all'.format(outfile), 'w')
            # NOTE(review): `c` leaks from the loop above — the count is
            # taken from whichever table iterated last; verify intent.
            node_num = len(vectors[c].keys())
            fout.write("{} {}\n".format(node_num, self.rep_size * 2))
            for node, vec in vectors['node_embeddings_order1'].items():
                fout.write("{} {} {}\n".format(
                    node, ' '.join([str(x) for x in vec]), ' '.join([
                        str(x)
                        for x in vectors['node_embeddings_order2'][node]
                    ])))
            fout.close()

    def get_one_embeddings(self, embeddings):
        """Map one embedding matrix row-by-row back to original node ids."""
        vectors = dict()
        look_back = self.g.look_back_list
        for i, embedding in enumerate(embeddings):
            vectors[look_back[i]] = embedding
        return vectors

    def get_vectors(self):
        """Collect all embedding tables keyed by name.

        Always includes order-1 and order-2 node embeddings; content
        embeddings are added when order is 2 or 3.
        """
        order = self.order
        ret = dict()
        node_embeddings_order1 = self.get_one_embeddings(
            self.embeddings['order1'])
        ret['node_embeddings_order1'] = node_embeddings_order1
        node_embeddings_order2 = self.get_one_embeddings(
            self.embeddings['order2'])
        ret['node_embeddings_order2'] = node_embeddings_order2
        if order == 2 or order == 3:
            content_embeddings = dict()
            content_embeddings = self.get_one_embeddings(
                self.embeddings['content'])
            ret['content_embeddings'] = content_embeddings
        return ret
class _MNA(object): def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file): if os.path.exists('log/' + log_file + '.log'): os.remove('log/' + log_file + '.log') self.logger = LogHandler(log_file) if not isinstance(graph, dict): self.logger.error('The graph must contain src and target graphs.') return self.graph = graph self.lookup = dict() self.lookup['f'] = self.graph['f'].look_up_dict self.lookup['g'] = self.graph['g'].look_up_dict self.look_back = dict() self.look_back['f'] = self.graph['f'].look_back_list self.look_back['g'] = self.graph['g'].look_back_list self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop) self.neg_ratio = neg_ratio self.batch_size = 1024 self.clf = svm.SVC(probability=True) def __get_pair_features(self, src_nds, target_nds): pair_features = list() if len(src_nds) != len(target_nds): self.logger.warn( 'The size of sampling in processing __get_pair_features is not equal.' ) yield pair_features for i in range(len(src_nds)): src_nd, target_nd = src_nds[i], target_nds[i] src_neighbor_anchors = set() for src_nd_to in self.graph['f'].G[self.look_back['f'][src_nd]]: if src_nd_to in self.L['f2g']['train']: src_neighbor_anchors.add(src_nd_to) target_neighbor_anchors = set() for target_nd_to in self.graph['g'].G[self.look_back['g'] [target_nd]]: if target_nd_to in self.L['g2f']['train']: target_neighbor_anchors.add(target_nd_to) cnt_common_neighbors = .0 AA_measure = .0 for sna in src_neighbor_anchors: for k in range(len(self.L['f2g']['train'][sna])): target_anchor_nd = self.L['f2g']['train'][sna][k] if target_anchor_nd in target_neighbor_anchors: cnt_common_neighbors += 1. AA_measure += 1./np.log((len(self.graph['f'].G[sna])\ +len(self.graph['g'].G[self.L['f2g']['train'][sna][k]]))/2.) 
jaccard = cnt_common_neighbors/(len(self.graph['f'].G[self.look_back['f'][src_nd]])\ +len(self.graph['g'].G[self.look_back['g'][target_nd]])\ -cnt_common_neighbors+1e-6) yield [cnt_common_neighbors, jaccard, AA_measure] def train(self): batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio, self.lookup, 'f', 'g') X = list() Y = list() for batch in batches_f2g: pos, neg = batch if not len(pos['f']) == len(pos['g']) and not len(neg['f']) == len( neg['g']): self.logger.info( 'The input label file goes wrong as the file format.') continue pos_features = list(self.__get_pair_features(pos['f'], pos['g'])) X.extend(pos_features) Y.extend([1 for m in range(len(pos_features))]) for k in range(self.neg_ratio): neg_features = list( self.__get_pair_features(neg['f'][k], neg['g'][k])) X.extend(neg_features) Y.extend([-1 for m in range(len(neg_features))]) self.logger.info('Training Model...') self.clf.fit(X, Y) self.logger.info('Training score: %f' % self.clf.score(X, Y)) self.logger.info('Complete Training process...')