class _MNA(object): def __init__(self, graph, attr_file, anchorfile, use_net, valid_prop, neg_ratio, log_file): if os.path.exists('log/' + log_file + '.log'): os.remove('log/' + log_file + '.log') self.logger = LogHandler(log_file) if not isinstance(graph, dict): self.logger.error('The graph must contain src and target graphs.') return self.use_net = use_net self.graph = graph self.lookup = dict() self.lookup['f'] = self.graph['f'].look_up_dict self.lookup['g'] = self.graph['g'].look_up_dict self.look_back = dict() self.look_back['f'] = self.graph['f'].look_back_list self.look_back['g'] = self.graph['g'].look_back_list self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop) self.attributes = dict() if attr_file: self.attributes['f'] = self._set_node_attributes(attr_file[0]) self.attributes['g'] = self._set_node_attributes(attr_file[1]) self.neg_ratio = neg_ratio self.batch_size = 1024 self.clf = svm.SVC(probability=True) def _set_node_attributes(self, attr_file): node_attributes = defaultdict(list) if not attr_file: return None with open(attr_file, 'r') as fin: for ln in fin: elems = ln.strip().split(',') node_attributes[elems[0]] = list(map(float, elems[1:])) return node_attributes def _get_pair_features(self, src_nds, target_nds): pair_features = list() if len(src_nds) != len(target_nds): self.logger.warn( 'The size of sampling in processing _get_pair_features is not equal.' ) yield pair_features for i in range(len(src_nds)): src_nd_idx, target_nd_idx = src_nds[i], target_nds[i] src_nd = self.look_back['f'][src_nd_idx] target_nd = self.look_back['g'][target_nd_idx] src_neighbor_anchors = set() for src_nd_to in self.graph['f'].G[src_nd]: if src_nd_to in self.L['f2g']['train']: src_neighbor_anchors.add(src_nd_to) target_neighbor_anchors = set() for target_nd_to in self.graph['g'].G[target_nd]: if target_nd_to in self.L['g2f']['train']: target_neighbor_anchors.add(target_nd_to) cnt_common_neighbors = .0 AA_measure = .0 for sna in src_neighbor_anchors: for k in range(len(self.L['f2g']['train'][sna])): target_anchor_nd = self.L['f2g']['train'][sna][k] if target_anchor_nd in target_neighbor_anchors: cnt_common_neighbors += 1. AA_measure += 1./np.log((len(self.graph['f'].G[sna])\ +len(self.graph['g'].G[self.L['f2g']['train'][sna][k]]))/2.) jaccard = cnt_common_neighbors/(len(self.graph['f'].G[src_nd])\ +len(self.graph['g'].G[target_nd])\ -cnt_common_neighbors+1e-6) # print(self.attributes['f'][src_nd], self.attributes['g'][target_nd]) feat_net = [] feat_attr = [] if self.use_net: feat_net = [cnt_common_neighbors, jaccard, AA_measure] if len(self.attributes) > 0: feat_len = len(self.attributes['f'][src_nd]) feat_attr = [1-self.attributes['f'][src_nd][k]\ +self.attributes['g'][target_nd][k] for k in range(feat_len)] # print(len(feat_net), len(feat_attr)) yield feat_net + feat_attr def train(self): batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio, self.lookup, 'f', 'g') X = list() Y = list() for batch in batches_f2g: pos, neg = batch if not len(pos['f']) == len(pos['g']) and not len(neg['f']) == len( neg['g']): self.logger.info( 'The input label file goes wrong as the file format.') continue pos_features = list(self._get_pair_features(pos['f'], pos['g'])) # print('feat_len (pos):',len(pos_features[0])) X.extend(pos_features) Y.extend([1 for m in range(len(pos_features))]) for k in range(self.neg_ratio): neg_features = list( self._get_pair_features(neg['f'][k], neg['g'][k])) X.extend(neg_features) # print('feat_len (neg):',len(neg_features[0])) Y.extend([-1 for m in range(len(neg_features))]) self.logger.info('Training Model...') print(len(X), len(X[0]), len(Y)) self.clf.fit(X, Y) print(self.clf) self.logger.info('Training score: %f' % self.clf.score(X, Y)) self.logger.info('Complete Training process...')
class _MNA(object): def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file): if os.path.exists('log/' + log_file + '.log'): os.remove('log/' + log_file + '.log') self.logger = LogHandler(log_file) if not isinstance(graph, dict): self.logger.error('The graph must contain src and target graphs.') return self.L = load_train_valid_labels(anchorfile, valid_prop) self.graph = graph self.look_up = dict() self.look_up['f'] = self.graph['f'].look_up_dict self.look_up['g'] = self.graph['g'].look_up_dict self.look_back = dict() self.look_back['f'] = self.graph['f'].look_back_list self.look_back['g'] = self.graph['g'].look_back_list self.neg_ratio = neg_ratio self.batch_size = 1024 self.clf = svm.SVC() def __get_pair_features(self, src_nds, target_nds): pair_features = list() if len(src_nds) != len(target_nds): self.logger.warn( 'The size of sampling in processing __get_pair_features is not equal.' ) yield pair_features for i in range(len(src_nds)): src_nd, target_nd = src_nds[i], target_nds[i] if not src_nd in self.graph['f'].G or not target_nd in self.graph[ 'g'].G: continue src_neighbor_anchors = set() for src_nd_to in self.graph['f'].G[src_nd]: if src_nd_to in self.L['f2g']['train']: src_neighbor_anchors.add(src_nd_to) target_neighbor_anchors = set() for target_nd_to in self.graph['g'].G[target_nd]: if target_nd_to in self.L['g2f']['train']: target_neighbor_anchors.add(target_nd_to) cnt_common_neighbors = .0 AA_measure = .0 for sna in src_neighbor_anchors: for k in range(len(self.L['f2g']['train'][sna])): target_anchor_nd = self.L['f2g']['train'][sna][k] if target_anchor_nd in target_neighbor_anchors: cnt_common_neighbors += 1. AA_measure += 1. / np.log( (len(self.graph['f'].G[sna]) + len(self.graph[ 'g'].G[self.L['f2g']['train'][sna][k]])) / 2.) jaccard = cnt_common_neighbors/(len(self.graph['f'].G[src_nd])\ +len(self.graph['g'].G[target_nd])-cnt_common_neighbors+1e-6) yield [cnt_common_neighbors, jaccard, AA_measure] def __batch_iter(self, lbs, batch_size, neg_ratio, lookup_src, lookup_obj, src_lb_tag, obj_lb_tag): train_lb_src2obj = lbs['{}2{}'.format(src_lb_tag, obj_lb_tag)]['train'] train_lb_obj2src = lbs['{}2{}'.format(obj_lb_tag, src_lb_tag)]['train'] train_size = len(train_lb_src2obj) start_index = 0 end_index = min(start_index + batch_size, train_size) src_lb_keys = train_lb_src2obj.keys() obj_lb_keys = train_lb_obj2src.keys() shuffle_indices = np.random.permutation(np.arange(train_size)) while start_index < end_index: pos_src = list() pos_obj = list() neg_src = list() neg_obj = list() for i in range(start_index, end_index): idx = shuffle_indices[i] src_lb = src_lb_keys[idx] obj_lbs = train_lb_src2obj[src_lb] for obj_lb in obj_lbs: cur_neg_src = list() cur_neg_obj = list() for k in range(neg_ratio): rand_obj_lb = None while not rand_obj_lb or rand_obj_lb in cur_neg_obj or rand_obj_lb in obj_lbs: rand_obj_lb_idx = random.randint( 0, len(obj_lb_keys) - 1) rand_obj_lb = obj_lb_keys[rand_obj_lb_idx] cur_neg_src.append(src_lb) cur_neg_obj.append(rand_obj_lb) pos_src.append(src_lb) pos_obj.append(obj_lb) neg_src.append(cur_neg_src) neg_obj.append(cur_neg_obj) start_index = end_index end_index = min(start_index + batch_size, train_size) yield pos_src, pos_obj, neg_src, neg_obj def train(self): batches_f2g = list(self.__batch_iter(self.L, self.batch_size, self.neg_ratio\ , self.look_up['f'], self.look_up['g'], 'f', 'g')) n_batches = len(batches_f2g) X = list() Y = list() for i in range(n_batches): pos_src_f2g, pos_obj_f2g, neg_src_f2g, neg_obj_f2g = batches_f2g[i] if not len(pos_src_f2g) == len(pos_obj_f2g) and not len( neg_src_f2g) == len(neg_obj_f2g): self.logger.info( 'The input label file goes wrong as the file format.') continue pos_features = list( self.__get_pair_features(pos_src_f2g, pos_obj_f2g)) X.extend(pos_features) Y.extend([1 for m in range(len(pos_features))]) for k in range(self.neg_ratio): neg_features = list( self.__get_pair_features(neg_src_f2g[k], neg_obj_f2g[k])) X.extend(neg_features) Y.extend([-1 for m in range(len(neg_features))]) self.logger.info('Training Model...') self.clf.fit(X, Y) self.logger.info('Complete Training process...')
class _MNA(object): def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file): if os.path.exists('log/' + log_file + '.log'): os.remove('log/' + log_file + '.log') self.logger = LogHandler(log_file) if not isinstance(graph, dict): self.logger.error('The graph must contain src and target graphs.') return self.graph = graph self.lookup = dict() self.lookup['f'] = self.graph['f'].look_up_dict self.lookup['g'] = self.graph['g'].look_up_dict self.look_back = dict() self.look_back['f'] = self.graph['f'].look_back_list self.look_back['g'] = self.graph['g'].look_back_list self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop) self.neg_ratio = neg_ratio self.batch_size = 1024 self.clf = svm.SVC(probability=True) def __get_pair_features(self, src_nds, target_nds): pair_features = list() if len(src_nds) != len(target_nds): self.logger.warn( 'The size of sampling in processing __get_pair_features is not equal.' ) yield pair_features for i in range(len(src_nds)): src_nd, target_nd = src_nds[i], target_nds[i] src_neighbor_anchors = set() for src_nd_to in self.graph['f'].G[self.look_back['f'][src_nd]]: if src_nd_to in self.L['f2g']['train']: src_neighbor_anchors.add(src_nd_to) target_neighbor_anchors = set() for target_nd_to in self.graph['g'].G[self.look_back['g'] [target_nd]]: if target_nd_to in self.L['g2f']['train']: target_neighbor_anchors.add(target_nd_to) cnt_common_neighbors = .0 AA_measure = .0 for sna in src_neighbor_anchors: for k in range(len(self.L['f2g']['train'][sna])): target_anchor_nd = self.L['f2g']['train'][sna][k] if target_anchor_nd in target_neighbor_anchors: cnt_common_neighbors += 1. AA_measure += 1./np.log((len(self.graph['f'].G[sna])\ +len(self.graph['g'].G[self.L['f2g']['train'][sna][k]]))/2.) jaccard = cnt_common_neighbors/(len(self.graph['f'].G[self.look_back['f'][src_nd]])\ +len(self.graph['g'].G[self.look_back['g'][target_nd]])\ -cnt_common_neighbors+1e-6) yield [cnt_common_neighbors, jaccard, AA_measure] def train(self): batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio, self.lookup, 'f', 'g') X = list() Y = list() for batch in batches_f2g: pos, neg = batch if not len(pos['f']) == len(pos['g']) and not len(neg['f']) == len( neg['g']): self.logger.info( 'The input label file goes wrong as the file format.') continue pos_features = list(self.__get_pair_features(pos['f'], pos['g'])) X.extend(pos_features) Y.extend([1 for m in range(len(pos_features))]) for k in range(self.neg_ratio): neg_features = list( self.__get_pair_features(neg['f'][k], neg['g'][k])) X.extend(neg_features) Y.extend([-1 for m in range(len(neg_features))]) self.logger.info('Training Model...') self.clf.fit(X, Y) self.logger.info('Training score: %f' % self.clf.score(X, Y)) self.logger.info('Complete Training process...')