Example #1
import os
import random

import numpy as np
from sklearn import svm

# LogHandler and load_train_valid_labels are assumed to be provided by this
# repository's utility modules.


class _MNA(object):
    def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return

        self.L = load_train_valid_labels(anchorfile, valid_prop)
        self.graph = graph
        self.look_up = dict()
        self.look_up['f'] = self.graph['f'].look_up_dict
        self.look_up['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list

        self.neg_ratio = neg_ratio
        self.batch_size = 1024

        self.clf = svm.SVC()

    def __get_pair_features(self, src_nds, target_nds):
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The source and target node lists passed to __get_pair_features differ in length.'
            )
            return
        for i in range(len(src_nds)):
            src_nd, target_nd = src_nds[i], target_nds[i]

            if (src_nd not in self.graph['f'].G
                    or target_nd not in self.graph['g'].G):
                continue

            src_neighbor_anchors = set()
            for src_nd_to in self.graph['f'].G[src_nd]:
                if src_nd_to in self.L['f2g']['train']:
                    src_neighbor_anchors.add(src_nd_to)

            target_neighbor_anchors = set()
            for target_nd_to in self.graph['g'].G[target_nd]:
                if target_nd_to in self.L['g2f']['train']:
                    target_neighbor_anchors.add(target_nd_to)

            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for k in range(len(self.L['f2g']['train'][sna])):
                    target_anchor_nd = self.L['f2g']['train'][sna][k]
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        AA_measure += 1. / np.log(
                            (len(self.graph['f'].G[sna]) + len(self.graph[
                                'g'].G[self.L['f2g']['train'][sna][k]])) / 2.)
            jaccard = cnt_common_neighbors/(len(self.graph['f'].G[src_nd])\
                    +len(self.graph['g'].G[target_nd])-cnt_common_neighbors+1e-6)

            yield [cnt_common_neighbors, jaccard, AA_measure]

    def __batch_iter(self, lbs, batch_size, neg_ratio, lookup_src, lookup_obj,
                     src_lb_tag, obj_lb_tag):
        train_lb_src2obj = lbs['{}2{}'.format(src_lb_tag, obj_lb_tag)]['train']
        train_lb_obj2src = lbs['{}2{}'.format(obj_lb_tag, src_lb_tag)]['train']
        train_size = len(train_lb_src2obj)
        start_index = 0
        end_index = min(start_index + batch_size, train_size)

        src_lb_keys = list(train_lb_src2obj.keys())  # list() so the keys are indexable in Python 3
        obj_lb_keys = list(train_lb_obj2src.keys())
        shuffle_indices = np.random.permutation(np.arange(train_size))
        while start_index < end_index:
            pos_src = list()
            pos_obj = list()
            neg_src = list()
            neg_obj = list()
            for i in range(start_index, end_index):
                idx = shuffle_indices[i]
                src_lb = src_lb_keys[idx]
                obj_lbs = train_lb_src2obj[src_lb]
                for obj_lb in obj_lbs:
                    cur_neg_src = list()
                    cur_neg_obj = list()
                    for k in range(neg_ratio):
                        rand_obj_lb = None
                        while not rand_obj_lb or rand_obj_lb in cur_neg_obj or rand_obj_lb in obj_lbs:
                            rand_obj_lb_idx = random.randint(
                                0,
                                len(obj_lb_keys) - 1)
                            rand_obj_lb = obj_lb_keys[rand_obj_lb_idx]
                        cur_neg_src.append(src_lb)
                        cur_neg_obj.append(rand_obj_lb)
                    pos_src.append(src_lb)
                    pos_obj.append(obj_lb)
                    neg_src.append(cur_neg_src)
                    neg_obj.append(cur_neg_obj)

            start_index = end_index
            end_index = min(start_index + batch_size, train_size)

            yield pos_src, pos_obj, neg_src, neg_obj

    def train(self):

        batches_f2g = list(
            self.__batch_iter(self.L, self.batch_size, self.neg_ratio,
                              self.look_up['f'], self.look_up['g'], 'f', 'g'))
        n_batches = len(batches_f2g)

        X = list()
        Y = list()
        for i in range(n_batches):
            pos_src_f2g, pos_obj_f2g, neg_src_f2g, neg_obj_f2g = batches_f2g[i]
            if len(pos_src_f2g) != len(pos_obj_f2g) or len(
                    neg_src_f2g) != len(neg_obj_f2g):
                self.logger.info(
                    'Skipping batch: the input label file is malformed.')
                continue
            pos_features = list(
                self.__get_pair_features(pos_src_f2g, pos_obj_f2g))
            X.extend(pos_features)
            Y.extend([1 for m in range(len(pos_features))])

            for k in range(self.neg_ratio):
                neg_features = list(
                    self.__get_pair_features(neg_src_f2g[k], neg_obj_f2g[k]))
                X.extend(neg_features)
                Y.extend([-1 for m in range(len(neg_features))])

            self.logger.info('Training Model...')
            self.clf.fit(X, Y)
            self.logger.info('Complete Training process...')
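A minimal usage sketch for the class above. Graph and the file names are assumptions taken from the surrounding examples, not part of this snippet:

# Hypothetical usage; Graph comes from this repository's graph module and the
# file paths are placeholders.
graph = {'f': Graph(), 'g': Graph()}
graph['f'].read_edgelist(filename='src.edgelist')
graph['g'].read_edgelist(filename='target.edgelist')
model = _MNA(graph, anchorfile='anchors.txt', valid_prop=0.9,
             neg_ratio=5, log_file='mna_example')
model.train()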
Example #2
def main(args):
    t1 = time.time()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    # args.use_net=False
    logger = LogHandler('RUN.' +
                        time.strftime('%Y-%m-%d', time.localtime(time.time())))
    logger.info(args)

    SAVING_STEP = args.saving_step
    MAX_EPOCHS = args.epochs
    if args.method == 'pale':
        model = PALE(learning_rate=args.lr,
                     batch_size=args.batch_size,
                     n_input=args.input_size,
                     n_hidden=args.hidden_size,
                     n_layer=args.layers,
                     files=args.embeddings + args.identity_linkage,
                     type_model=args.type_model,
                     is_valid=args.is_valid,
                     log_file=args.log_file,
                     device=args.device)
        losses = np.zeros(MAX_EPOCHS)
        val_scrs = np.zeros(MAX_EPOCHS)
        best_scr = .0
        best_epoch = 0
        thres = 100
        for i in range(1, MAX_EPOCHS + 1):
            losses[i - 1], val_scrs[i - 1] = model.train_one_epoch()
            if i > 0 and i % SAVING_STEP == 0:
                loss_mean = np.mean(losses[i - SAVING_STEP:i])
                scr_mean = np.mean(val_scrs[i - SAVING_STEP:i])
                logger.info(
                    'loss in last {} epochs: {}, validation in last {} epochs: {}'
                    .format(SAVING_STEP, loss_mean, SAVING_STEP, scr_mean))
                if scr_mean > best_scr:
                    best_scr = scr_mean
                    best_epoch = i
                    model.save_models(args.output)
                if args.early_stop and i >= thres * SAVING_STEP:
                    cnt = 0
                    for k in range(thres - 1, -1, -1):
                        cur_val = np.mean(
                            val_scrs[i - (k + 1) * SAVING_STEP:i -
                                     k * SAVING_STEP])
                        if cur_val < best_scr:
                            cnt += 1
                    if cnt == thres and (i -
                                         best_epoch) >= thres * SAVING_STEP:
                        logger.info('*********early stop*********')
                        logger.info(
                            'The best epoch: {}\nThe validation score: {}'.
                            format(best_epoch, best_scr))
                        break
    if args.method == 'mna' or args.method == 'fruip':
        graph = defaultdict(Graph)
        print("Loading graph...")
        if len(args.graphs) != 2:
            logger.error('#####Exactly two input graphs are required!#####')
            sys.exit(1)
        if args.graph_format == 'adjlist':
            if args.graphs[0]:
                graph['f'].read_adjlist(filename=args.graphs[0])
            if args.graphs[1]:
                graph['g'].read_adjlist(filename=args.graphs[1])
        if args.graph_format == 'edgelist':
            if args.graphs[0]:
                graph['f'].read_edgelist(filename=args.graphs[0])
            if args.graphs[1]:
                graph['g'].read_edgelist(filename=args.graphs[1])

        if args.method == 'mna':
            model = MNA(graph=graph,
                        attr_file=args.embeddings,
                        anchorfile=args.identity_linkage,
                        valid_prop=1.,
                        use_net=args.use_net,
                        neg_ratio=args.neg_ratio,
                        log_file=args.log_file)
        if args.method == 'fruip':
            model = FRUIP(graph=graph,
                          embed_files=args.embeddings,
                          linkage_file=args.identity_linkage)
            model.main_proc(args.threshold)
    if args.method == 'final':
        main_proc(graph_files=args.graphs,
                  graph_sizes=args.graph_sizes,
                  linkage_file=args.identity_linkage,
                  alpha=args.alpha,
                  epoch=args.epochs,
                  tol=args.tol,
                  graph_format=args.graph_format,
                  output_file=args.output)
    if args.method == 'crossmna':
        num_graphs = len(args.graphs)
        layer_graphs = [Graph() for i in range(num_graphs)]
        for k in range(num_graphs):
            graph_path = args.graphs[k]
            format_graph_path = '{}.crossmna'.format(graph_path)
            format_crossmna_graph(graph_path, format_graph_path, k)
            if args.graph_format == 'adjlist':
                layer_graphs[k].read_adjlist(filename=format_graph_path)
            if args.graph_format == 'edgelist':
                layer_graphs[k].read_edgelist(filename=format_graph_path)
        model = CROSSMNA(layer_graphs=layer_graphs,
                         anchor_file=args.identity_linkage,
                         lr=args.lr,
                         batch_size=args.batch_size,
                         nd_rep_size=args.nd_rep_size,
                         layer_rep_size=args.layer_rep_size,
                         epoch=args.epochs,
                         negative_ratio=args.neg_ratio,
                         table_size=args.table_size,
                         outfile=args.output,
                         log_file=args.log_file)
    if args.method in ['mna', 'fruip', 'pale']:
        model.save_model(args.output)
    t2 = time.time()
    print('time cost:', t2 - t1)
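The argument parser is not shown in this example. A minimal sketch of the flags main() reads on the 'mna' code path might look like this; the flag names mirror the attribute accesses above and every default is illustrative:

# Hypothetical parser sketch; only the attributes touched by the 'mna' branch
# (plus the unconditional ones) are wired up.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--method', default='mna')
parser.add_argument('--graphs', nargs=2)
parser.add_argument('--graph-format', dest='graph_format', default='edgelist')
parser.add_argument('--embeddings', nargs='*', default=[])
parser.add_argument('--identity-linkage', dest='identity_linkage')
parser.add_argument('--use-net', dest='use_net', action='store_true')
parser.add_argument('--neg-ratio', dest='neg_ratio', type=int, default=5)
parser.add_argument('--log-file', dest='log_file', default='mna')
parser.add_argument('--output', default='result')
parser.add_argument('--gpu-id', dest='gpu_id', default='0')
parser.add_argument('--saving-step', dest='saving_step', type=int, default=1)
parser.add_argument('--epochs', type=int, default=100)

main(parser.parse_args())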
Example #3
class _MNA(object):
    def __init__(self, graph, attr_file, anchorfile, use_net, valid_prop,
                 neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return

        self.use_net = use_net
        self.graph = graph
        self.lookup = dict()
        self.lookup['f'] = self.graph['f'].look_up_dict
        self.lookup['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list
        self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop)

        self.attributes = dict()
        if attr_file:
            self.attributes['f'] = self._set_node_attributes(attr_file[0])
            self.attributes['g'] = self._set_node_attributes(attr_file[1])

        self.neg_ratio = neg_ratio
        self.batch_size = 1024

        self.clf = svm.SVC(probability=True)

    def _set_node_attributes(self, attr_file):
        node_attributes = defaultdict(list)
        if not attr_file:
            return None
        with open(attr_file, 'r') as fin:
            for ln in fin:
                elems = ln.strip().split(',')
                node_attributes[elems[0]] = list(map(float, elems[1:]))
        return node_attributes

    def _get_pair_features(self, src_nds, target_nds):
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The source and target node lists passed to _get_pair_features differ in length.'
            )
            return
        for i in range(len(src_nds)):
            src_nd_idx, target_nd_idx = src_nds[i], target_nds[i]
            src_nd = self.look_back['f'][src_nd_idx]
            target_nd = self.look_back['g'][target_nd_idx]

            src_neighbor_anchors = set()
            for src_nd_to in self.graph['f'].G[src_nd]:
                if src_nd_to in self.L['f2g']['train']:
                    src_neighbor_anchors.add(src_nd_to)

            target_neighbor_anchors = set()
            for target_nd_to in self.graph['g'].G[target_nd]:
                if target_nd_to in self.L['g2f']['train']:
                    target_neighbor_anchors.add(target_nd_to)

            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for k in range(len(self.L['f2g']['train'][sna])):
                    target_anchor_nd = self.L['f2g']['train'][sna][k]
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        AA_measure += 1./np.log((len(self.graph['f'].G[sna])\
                                                +len(self.graph['g'].G[self.L['f2g']['train'][sna][k]]))/2.)
            jaccard = cnt_common_neighbors/(len(self.graph['f'].G[src_nd])\
                                            +len(self.graph['g'].G[target_nd])\
                                            -cnt_common_neighbors+1e-6)

            # print(self.attributes['f'][src_nd], self.attributes['g'][target_nd])
            feat_net = []
            feat_attr = []
            if self.use_net:
                feat_net = [cnt_common_neighbors, jaccard, AA_measure]
            if len(self.attributes) > 0:
                feat_len = len(self.attributes['f'][src_nd])
                feat_attr = [1-self.attributes['f'][src_nd][k]\
                                +self.attributes['g'][target_nd][k] for k in range(feat_len)]

            # print(len(feat_net), len(feat_attr))
            yield feat_net + feat_attr

    def train(self):

        batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio,
                                 self.lookup, 'f', 'g')

        X = list()
        Y = list()
        for batch in batches_f2g:
            pos, neg = batch
            if len(pos['f']) != len(pos['g']) or len(neg['f']) != len(
                    neg['g']):
                self.logger.info(
                    'Skipping batch: the input label file is malformed.')
                continue
            pos_features = list(self._get_pair_features(pos['f'], pos['g']))
            # print('feat_len (pos):',len(pos_features[0]))
            X.extend(pos_features)
            Y.extend([1 for m in range(len(pos_features))])

            for k in range(self.neg_ratio):
                neg_features = list(
                    self._get_pair_features(neg['f'][k], neg['g'][k]))
                X.extend(neg_features)
                # print('feat_len (neg):',len(neg_features[0]))
                Y.extend([-1 for m in range(len(neg_features))])

            self.logger.info('Training Model...')
            print(len(X), len(X[0]), len(Y))
            self.clf.fit(X, Y)
            print(self.clf)
            self.logger.info('Training score: %f' % self.clf.score(X, Y))
            self.logger.info('Complete Training process...')
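The attribute files parsed by _set_node_attributes above are plain CSV, one node per line: a node id followed by its numeric attributes. A toy file (values are made up) can be produced like this:

# Write a toy attribute file in the node_id,attr1,attr2,... format the parser expects.
with open('toy_attrs.csv', 'w') as fout:
    fout.write('u1,0.12,0.80,0.33\n')
    fout.write('u2,0.05,0.91,0.27\n')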
Example #4
import math
import os
import random
from collections import defaultdict

import numpy as np

# LogHandler is assumed to be provided by this repository's logging utilities.


class _LINE_ANCHORREG_ALIGN_PRETRAIN(object):

    def __init__(self, graph, lr=.001, gamma=.1, rep_size=128, batch_size=100,
                 negative_ratio=5, order=3, table_size=1e8, embedfile=None,
                 anchorfile=None, log_file='log'):

        if not embedfile or not anchorfile:
            return

        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = int(table_size)  # may be given as a float literal such as 1e8
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6

        self._init_sigmoid_table()

        self.g = graph
        self.look_up = self.g.look_up_dict
        self.idx = defaultdict(int)
        self.update_dict = defaultdict(dict)
        self.update_look_back = defaultdict(list)

        self.node_size = graph.G.number_of_nodes()
        self.rep_size = rep_size

        self.order = order
        self.lr = lr
        self.gamma = gamma
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

        self._gen_sampling_table()
        self._init_params(self.node_size, rep_size, embedfile, anchorfile)

    def _init_params(self, node_size, rep_size, embedfile, anchorfile):
        self.embeddings = dict()
        self.embeddings['order1'] = np.random.normal(0, 1,
                                                     (node_size, rep_size))
        self.embeddings['order2'] = np.random.normal(0, 1,
                                                     (node_size, rep_size))
        self.embeddings['content'] = np.random.normal(0, 1,
                                                      (node_size, rep_size))
        self.embeddings['order1'] = self._set_anchor_nds(
            self.embeddings['order1'], embedfile, anchorfile, 1)
        self.embeddings['order2'] = self._set_anchor_nds(
            self.embeddings['order2'], embedfile, anchorfile, 2)
        self._init_update_params(node_size, rep_size)
        self._pre_train()

    def _init_update_params(self, node_size, rep_size):
        # adagrad
        self.h_delta = dict()
        self.h_delta['order1'] = np.zeros((node_size, rep_size))
        self.h_delta['order2'] = np.zeros((node_size, rep_size))
        self.h_delta['content'] = np.zeros((node_size, rep_size))
        # adam
        self.m = dict()
        self.m['order1'] = np.zeros((node_size, rep_size))
        self.m['order2'] = np.zeros((node_size, rep_size))
        self.m['content'] = np.zeros((node_size, rep_size))
        self.v = dict()
        self.v['order1'] = np.zeros((node_size, rep_size))
        self.v['order2'] = np.zeros((node_size, rep_size))
        self.v['content'] = np.zeros((node_size, rep_size))
        self.t = 1

    def _read_anchors(self, anchorfile):
        anchors = list()
        with open(anchorfile, 'r') as anchor_handler:
            for ln in anchor_handler:
                elems = ln.strip().split()
                anchors.append((elems[0], elems[1]))
        return anchors

    def _read_embeddings(self, embedfile):
        embeddings = dict()
        with open(embedfile, 'r') as embed_handler:
            for ln in embed_handler:
                elems = ln.strip().split()
                if len(elems) <= 2:
                    continue
                embeddings[elems[0]] = list(map(float, elems[1:]))
        return embeddings

    def _set_anchor_nds(self, mat, embedfile, anchorfile, order):
        self.anchors = self._read_anchors(anchorfile)
        self.src_embeddings = self._read_embeddings(embedfile)
        self.anchor_idx = set()
        for src_nd, target_nd in self.anchors:
            if not target_nd in self.look_up or not src_nd in self.src_embeddings:
                continue
            if len(mat[self.look_up[target_nd]]) != len(
                    self.src_embeddings[src_nd]):
                self.logger.error(
                    'The source and target embeddings at an anchor node have different lengths')
                break
            self.anchor_idx.add(self.look_up[target_nd])
            # if order==1:
            #     mat[self.look_up[target_nd]] = self.src_embeddings[src_nd][0:len(mat[self.look_up[target_nd]])]
            # if order==2:
            #     mat[self.look_up[target_nd]] = self.src_embeddings[src_nd][len(mat[self.look_up[target_nd]]):]
            mat[self.look_up[target_nd]] = self.src_embeddings[src_nd]
        return mat

    def _pre_train(self):
        self.logger.info("Pretraining...")
        DISPLAY_EPOCH = 1000
        order = self.order
        batches = self.batch_iter()
        opt_type = 'adam'
        for batch in batches:
            self.idx = defaultdict(int)
            self.update_look_back = defaultdict(list)
            self.update_dict = defaultdict(dict)
            if order == 1 or order == 3:
                delta_eh_o1 = self._pretrain_update_graph_by_order1(batch)
                len_delta = len(delta_eh_o1)
                # print 'order1 nd'
                if opt_type == 'adagrad':
                    self.h_delta['order1'], self.embeddings['order1'] = \
                                    self.update_vec('nd_order1', self.h_delta['order1'], delta_eh_o1
                                                    , self.embeddings['order1'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order1'], self.v['order1'], self.embeddings['order1'] = \
                                    self.update_vec_by_adam('nd_order1', self.m['order1'], self.v['order1'], delta_eh_o1
                                                    , self.embeddings['order1'], len_delta, self.t)
            if order == 2 or order == 3:
                delta_c, delta_eh_o2 = self._pretrain_update_graph_by_order2(
                    batch)
                len_delta = len(delta_eh_o2)
                # print 'order2, nd'
                if opt_type == 'adagrad':
                    self.h_delta['order2'], self.embeddings['order2'] = \
                                        self.update_vec('nd_order2', self.h_delta['order2'], delta_eh_o2
                                                        , self.embeddings['order2'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order2'], self.v['order2'], self.embeddings['order2'] = \
                                    self.update_vec_by_adam('nd_order2', self.m['order2'], self.v['order2'], delta_eh_o2
                                                    , self.embeddings['order2'], len_delta, self.t)
                len_content = len(delta_c)
                # print 'order2, content'
                if opt_type == 'adagrad':
                    self.h_delta['content'], self.embeddings['content'] = \
                                        self.update_vec('cnt_order2', self.h_delta['content'], delta_c
                                                        , self.embeddings['content'], len_content, self.t)
                if opt_type == 'adam':
                    self.m['content'], self.v['content'], self.embeddings['content'] = \
                                    self.update_vec_by_adam('cnt_order2', self.m['content'], self.v['content'], delta_c
                                                    , self.embeddings['content'], len_content, self.t)
                # self.embeddings_order2[self.update_look_back[:len_de],:] -= self.lr*delta_eh
                # len_content = len(delta_c)
                # self.content_embeddings[self.update_look_back[:len_content],:] -= self.lr*delta_c
                # break
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self._init_update_params(self.node_size, self.rep_size)
        self.logger.info("End of Pretraining")

    def _init_sigmoid_table(self):
        for k in range(self.sigmoid_table_size):
            x = 2 * self.SIGMOID_BOUND * k / self.sigmoid_table_size - self.SIGMOID_BOUND
            self.sigmoid_table[k] = 1. / (1 + np.exp(-x))

    def _fast_sigmoid(self, val):
        # >= / <= so that val at exactly ±SIGMOID_BOUND cannot index past the table
        if val >= self.SIGMOID_BOUND:
            return 1 - self.epsilon
        elif val <= -self.SIGMOID_BOUND:
            return self.epsilon
        k = int((val + self.SIGMOID_BOUND) * self.sigmoid_table_size /
                self.SIGMOID_BOUND / 2)
        return self.sigmoid_table[k]
        # return 1./(1+np.exp(-val))

    def _pretrain_update_graph_by_order2(self, batch):
        '''Accumulate second-order gradients for one batch, leaving anchor nodes fixed.'''
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)
        # print pos_h, pos_t, pos_h_v, neg_t

        # order 2
        pos_u = self.embeddings['order2'][pos_h, :]
        pos_v_c = self.embeddings['content'][pos_t, :]
        neg_u = self.embeddings['order2'][pos_h_v, :]
        neg_v_c = self.embeddings['content'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v_c,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # temporal delta
        delta_eh = list()
        delta_c = list()

        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            if not v in self.anchor_idx:
                delta_c = self._calc_delta_vec('cnt_order2', v, delta_c,
                                               (sigmoid_pos_e[i] - 1) *
                                               pos_u[i, :])
            if not u in self.anchor_idx:
                delta_eh = self._calc_delta_vec('nd_order2', u, delta_eh,
                                                (sigmoid_pos_e[i] - 1) *
                                                pos_v_c[i, :])
            # print 'delta_eh',delta_eh,ndDict_order
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                if not v in self.anchor_idx:
                    delta_c = self._calc_delta_vec(
                        'cnt_order2', v, delta_c,
                        sigmoid_neg_e[i, j] * neg_u[i, j, :])
                if not u in self.anchor_idx:
                    delta_eh = self._calc_delta_vec(
                        'nd_order2', u, delta_eh,
                        sigmoid_neg_e[i, j] * neg_v_c[i, j, :])
                # print sigmoid_neg_e[i,j]*neg_v_c[i,j,:], type(sigmoid_neg_e[i,j]*neg_v_c[i,j,:])
                # print 'delta_eh',delta_eh,ndDict_order

        # delta x & delta codebook
        delta_eh = self._format_vec('nd_order2', delta_eh)
        delta_c = self._format_vec('cnt_order2', delta_c)

        return delta_c / batch_size, delta_eh / batch_size

    def _pretrain_update_graph_by_order1(self, batch):
        '''Accumulate first-order gradients for one batch, leaving anchor nodes fixed.'''
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)

        # order 1
        pos_u = self.embeddings['order1'][pos_h, :]
        pos_v = self.embeddings['order1'][pos_t, :]
        neg_u = self.embeddings['order1'][pos_h_v, :]
        neg_v = self.embeddings['order1'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # delta calculation
        delta_eh = list()

        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            if not v in self.anchor_idx:
                delta_eh = self._calc_delta_vec('nd_order1', v, delta_eh,
                                                (sigmoid_pos_e[i] - 1) *
                                                pos_u[i, :])
            if not u in self.anchor_idx:
                delta_eh = self._calc_delta_vec('nd_order1', u, delta_eh,
                                                (sigmoid_pos_e[i] - 1) *
                                                pos_v[i, :])
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                if not v in self.anchor_idx:
                    delta_eh = self._calc_delta_vec(
                        'nd_order1', v, delta_eh,
                        sigmoid_neg_e[i, j] * neg_u[i, j, :])
                if not u in self.anchor_idx:
                    delta_eh = self._calc_delta_vec(
                        'nd_order1', u, delta_eh,
                        sigmoid_neg_e[i, j] * neg_v[i, j, :])

        # delta x & delta codebook
        delta_eh = self._format_vec('nd_order1', delta_eh)

        return delta_eh / batch_size

    def _cos_sim(self, vec1, vec2):
        return np.dot(vec1, vec2) / np.linalg.norm(vec1) / np.linalg.norm(vec2)

    def _update_graph_by_anchor_reg(self):

        delta_eh = list()

        cnt = 0
        for src_nd, target_nd in self.anchors:
            if not src_nd in self.src_embeddings or not target_nd in self.look_up:
                continue
            src_emb = np.array(self.src_embeddings[src_nd])
            if self.order == 1:
                target_emb = self.embeddings['order1'][self.look_up[target_nd]]
            else:
                target_emb = self.embeddings['order2'][self.look_up[target_nd]]
            delta_eh = self._calc_delta_vec(
                'nd_order2', self.look_up[target_nd], delta_eh,
                (self._cos_sim(src_emb, target_emb) * target_emb /
                 np.dot(target_emb, target_emb) - src_emb /
                 np.linalg.norm(src_emb) / np.linalg.norm(target_emb)) /
                self._cos_sim(src_emb, target_emb))
            cnt += 1

        # the deltas above are registered under 'nd_order2', so format with the same key
        delta_eh = self._format_vec('nd_order2', delta_eh)

        return delta_eh / max(cnt, 1)

    def _format_vec(self, cal_type, vec):
        len_gap = self.idx[cal_type] - len(vec)
        if len_gap > 0:
            for i in range(len_gap):
                if isinstance(vec, list):
                    vec.append(np.zeros(vec[0].shape))
                else:
                    vec = np.vstack((vec, np.zeros(vec[0].shape)))
        return np.array(vec)

    def _calc_delta_vec(self, cal_type, nd, delta, opt_vec):
        if nd not in self.update_dict[cal_type]:
            cur_idx = self.idx[cal_type]
            self.update_dict[cal_type][nd] = cur_idx
            self.update_look_back[cal_type].append(nd)
            self.idx[cal_type] += 1
        else:
            cur_idx = self.update_dict[cal_type][nd]
        if cur_idx >= len(delta):
            for i in range(cur_idx - len(delta)):
                delta.append(np.zeros(opt_vec.shape))
            delta.append(opt_vec)
        else:
            delta[cur_idx] += opt_vec
        return delta

    def _update_graph_by_order2(self, batch):
        '''Accumulate second-order gradients for one batch.'''
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)
        # print pos_h, pos_t, pos_h_v, neg_t

        # order 2
        pos_u = self.embeddings['order2'][pos_h, :]
        pos_v_c = self.embeddings['content'][pos_t, :]
        neg_u = self.embeddings['order2'][pos_h_v, :]
        neg_v_c = self.embeddings['content'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v_c,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # temporal delta
        delta_eh = list()
        delta_c = list()

        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            delta_c = self._calc_delta_vec(
                'cnt_order2', v, delta_c, (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            delta_eh = self._calc_delta_vec('nd_order2', u, delta_eh,
                                            (sigmoid_pos_e[i] - 1) *
                                            pos_v_c[i, :])
            # print 'delta_eh',delta_eh,ndDict_order
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                delta_c = self._calc_delta_vec(
                    'cnt_order2', v, delta_c,
                    sigmoid_neg_e[i, j] * neg_u[i, j, :])
                delta_eh = self._calc_delta_vec(
                    'nd_order2', u, delta_eh,
                    sigmoid_neg_e[i, j] * neg_v_c[i, j, :])
                # print sigmoid_neg_e[i,j]*neg_v_c[i,j,:], type(sigmoid_neg_e[i,j]*neg_v_c[i,j,:])
                # print 'delta_eh',delta_eh,ndDict_order

        # delta x & delta codebook
        delta_eh = self._format_vec('nd_order2', delta_eh)
        delta_c = self._format_vec('cnt_order2', delta_c)

        return delta_c / batch_size, delta_eh / batch_size

    def _update_graph_by_order1(self, batch):
        '''Accumulate first-order gradients for one batch.'''
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)

        # order 1
        pos_u = self.embeddings['order1'][pos_h, :]
        pos_v = self.embeddings['order1'][pos_t, :]
        neg_u = self.embeddings['order1'][pos_h_v, :]
        neg_v = self.embeddings['order1'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # delta calculation
        delta_eh = list()

        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            delta_eh = self._calc_delta_vec(
                'nd_order1', v, delta_eh, (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            delta_eh = self._calc_delta_vec(
                'nd_order1', u, delta_eh, (sigmoid_pos_e[i] - 1) * pos_v[i, :])
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                delta_eh = self._calc_delta_vec(
                    'nd_order1', v, delta_eh,
                    sigmoid_neg_e[i, j] * neg_u[i, j, :])
                delta_eh = self._calc_delta_vec(
                    'nd_order1', u, delta_eh,
                    sigmoid_neg_e[i, j] * neg_v[i, j, :])

        # delta x & delta codebook
        delta_eh = self._format_vec('nd_order1', delta_eh)

        return delta_eh / batch_size

    def _mat_add(self, mat1, mat2):
        # print '****mat add****'
        # print mat1, mat2
        len_gap = len(mat1) - len(mat2)
        # print len_gap
        if len_gap > 0:
            for i in range(len_gap):
                mat2 = np.vstack((mat2, np.zeros(mat2[0, :].shape)))
                # print mat2
        else:
            for i in range(-len_gap):
                mat1 = np.vstack((mat1, np.zeros(mat1[0, :].shape)))
        #         print mat1
        # print len(mat1), len(mat2)
        return mat1 + mat2

    def get_anchor_reg_loss(self):

        cos_sim_list = list()

        for src_nd, target_nd in self.anchors:
            if not src_nd in self.src_embeddings or not target_nd in self.look_up:
                continue
            src_emb = np.array(self.src_embeddings[src_nd])
            target_emb = self.embeddings['order2'][self.look_up[target_nd]]
            cos_sim_list.append(self._cos_sim(src_emb, target_emb))

        return -np.mean(cos_sim_list)

    def get_graph_loss_by_order2(self, batch):
        pos_h, pos_t, pos_h_v, neg_t = batch

        # order 2
        pos_u = self.embeddings['order2'][pos_h, :]
        pos_v_c = self.embeddings['content'][pos_t, :]
        neg_u = self.embeddings['order2'][pos_h_v, :]
        neg_v_c = self.embeddings['content'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v_c,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        return -np.mean(
            np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_graph_loss_by_order1(self, batch):
        pos_h, pos_t, pos_h_v, neg_t = batch

        # order 1
        pos_u = self.embeddings['order1'][pos_h, :]
        pos_v = self.embeddings['order1'][pos_t, :]
        neg_u = self.embeddings['order1'][pos_h_v, :]
        neg_v = self.embeddings['order1'][neg_t, :]

        # pos_e_1 = np.sum(pos_u*pos_v, axis=1)+np.sum(self.b_e[key][0][pos_t,:], axis=1) # pos_e.shape = batch_size
        # neg_e_1 = np.sum(neg_u*neg_v, axis=2)+np.sum(self.b_e[key][0][neg_t,:], axis=2) # neg_e.shape = batch_size*negative_ratio
        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        return -np.mean(
            np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_cur_batch_loss(self, t, batch):
        DISPLAY_EPOCH = 1
        if t % DISPLAY_EPOCH == 0:
            loss_order_1 = 0.0
            loss_order_2 = 0.0
            if self.order == 1 or self.order == 3:
                loss_order_1 += self.get_graph_loss_by_order1(batch)
            if self.order == 2 or self.order == 3:
                anchor_loss = self.get_anchor_reg_loss()
                loss_order_2 += self.get_graph_loss_by_order2(
                    batch) + anchor_loss
            if self.order == 1:
                self.logger.info(
                    'Finish processing batch {} and loss from order 1:{}'.
                    format(t, loss_order_1))
            elif self.order == 2:
                self.logger.info(
                    'Finish processing batch {} and loss from order 2:{} and anchor loss:{}'
                    .format(t, loss_order_2, anchor_loss))
            elif self.order == 3:
                self.logger.info(
                    'Finish processing batch {} and loss from order 3:{}'.
                    format(t, loss_order_1 + loss_order_2))

    def update_vec(self, cal_type, h_delta, delta, embeddings, len_delta, t):
        h_delta[self.update_look_back[cal_type][:len_delta], :] += delta**2
        # print 'original embedding:',embeddings[self.update_look_back[cal_type][:len_delta]]
        embeddings[self.update_look_back[cal_type][:len_delta],:] -= \
                                self.lr/np.sqrt(h_delta[self.update_look_back[cal_type][:len_delta],:])*delta
        # print 'delta:',delta
        # print 'h_delta:',h_delta[self.update_look_back[cal_type][:len_delta]]
        # print 'embeddings:',embeddings[self.update_look_back[cal_type][:len_delta]]
        # print 'lmd_rda:',elem_lbd
        return h_delta, embeddings

    def update_vec_by_adam(self, cal_type, m, v, delta, embeddings, len_delta,
                           t):
        self.beta1 = .9
        self.beta2 = .999
        m[self.update_look_back[cal_type][:len_delta],:] = \
            self.beta1*m[self.update_look_back[cal_type][:len_delta],:]+(1-self.beta1)*delta
        v[self.update_look_back[cal_type][:len_delta],:] = \
            self.beta2*v[self.update_look_back[cal_type][:len_delta],:]+(1-self.beta2)*(delta**2)
        m_ = m[self.update_look_back[cal_type][:len_delta], :] / (
            1 - self.beta1**t)
        v_ = v[self.update_look_back[cal_type][:len_delta], :] / (
            1 - self.beta2**t)

        embeddings[
            self.update_look_back[cal_type][:len_delta], :] -= self.lr * m_ / (
                np.sqrt(v_) + self.epsilon)

        return m, v, embeddings

    def train_one_epoch(self):
        DISPLAY_EPOCH = 1000
        order = self.order
        batches = self.batch_iter()
        opt_type = 'adam'
        for batch in batches:
            self.idx = defaultdict(int)
            self.update_look_back = defaultdict(list)
            self.update_dict = defaultdict(dict)
            if order == 1 or order == 3:
                delta_eh_o1 = self._update_graph_by_order1(batch)
                len_delta = len(delta_eh_o1)
                # print 'order1 nd'
                if opt_type == 'adagrad':
                    self.h_delta['order1'], self.embeddings['order1'] = \
                                    self.update_vec('nd_order1', self.h_delta['order1'], delta_eh_o1
                                                    , self.embeddings['order1'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order1'], self.v['order1'], self.embeddings['order1'] = \
                                    self.update_vec_by_adam('nd_order1', self.m['order1'], self.v['order1'], delta_eh_o1
                                                    , self.embeddings['order1'], len_delta, self.t)
            if order == 2 or order == 3:
                delta_c, delta_eh_o2 = self._update_graph_by_order2(batch)
                delta_eh_anchor_reg = self._update_graph_by_anchor_reg()
                delta_eh_o2 = self._format_vec('nd_order2', delta_eh_o2)
                len_delta = len(delta_eh_o2)
                # print 'order2, nd'
                if opt_type == 'adagrad':
                    self.h_delta['order2'], self.embeddings['order2'] = \
                                        self.update_vec('nd_order2', self.h_delta['order2']
                                                        , delta_eh_o2+self.gamma*delta_eh_anchor_reg
                                                        , self.embeddings['order2'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order2'], self.v['order2'], self.embeddings['order2'] = \
                                    self.update_vec_by_adam('nd_order2', self.m['order2'], self.v['order2']
                                                    , delta_eh_o2+self.gamma*delta_eh_anchor_reg
                                                    , self.embeddings['order2'], len_delta, self.t)
                len_content = len(delta_c)
                # print 'order2, content'
                if opt_type == 'adagrad':
                    self.h_delta['content'], self.embeddings['content'] = \
                                        self.update_vec('cnt_order2', self.h_delta['content'], delta_c
                                                        , self.embeddings['content'], len_content, self.t)
                if opt_type == 'adam':
                    self.m['content'], self.v['content'], self.embeddings['content'] = \
                                    self.update_vec_by_adam('cnt_order2', self.m['content'], self.v['content'], delta_c
                                                    , self.embeddings['content'], len_content, self.t)
                # self.embeddings_order2[self.update_look_back[:len_de],:] -= self.lr*delta_eh
                # len_content = len(delta_c)
                # self.content_embeddings[self.update_look_back[:len_content],:] -= self.lr*delta_c
                # break
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self.cur_epoch += 1

    def get_random_node_pairs(self, i, shuffle_indices, edges, edge_set,
                              numNodes):
        # balance the appearance of edges according to edge_prob
        if not random.random() < self.edge_prob[shuffle_indices[i]]:
            shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]
        cur_h = edges[shuffle_indices[i]][0]
        head = cur_h * numNodes
        cur_t = edges[shuffle_indices[i]][1]
        cur_h_v = []
        cur_neg_t = []
        for j in range(self.negative_ratio):
            rn = self.sampling_table[random.randint(0, self.table_size - 1)]
            while head + rn in edge_set or cur_h == rn or rn in cur_neg_t:
                rn = self.sampling_table[random.randint(
                    0, self.table_size - 1)]
            cur_h_v.append(cur_h)
            cur_neg_t.append(rn)
        return cur_h, cur_t, cur_h_v, cur_neg_t

    def batch_iter(self):

        numNodes = self.node_size

        edges = [(self.look_up[x[0]], self.look_up[x[1]])
                 for x in self.g.G.edges()]
        data_size = self.g.G.number_of_edges()
        edge_set = set([x[0] * numNodes + x[1] for x in edges])
        shuffle_indices = np.random.permutation(np.arange(data_size))

        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while start_index < data_size:
            pos_h = []
            pos_t = []
            pos_h_v = []
            neg_t = []
            for i in range(start_index, end_index):
                cur_h, cur_t, cur_h_v, cur_neg_t = self.get_random_node_pairs(
                    i, shuffle_indices, edges, edge_set, numNodes)
                pos_h.append(cur_h)
                pos_t.append(cur_t)
                pos_h_v.append(cur_h_v)
                neg_t.append(cur_neg_t)
            ret = (pos_h, pos_t, pos_h_v, neg_t)

            start_index = end_index
            end_index = min(start_index + self.batch_size, data_size)

            yield ret

    def _gen_sampling_table(self):
        table_size = self.table_size
        power = 0.75
        numNodes = self.node_size

        print "Pre-procesing for non-uniform negative sampling!"
        self.node_degree = np.zeros(numNodes)  # out degree

        look_up = self.g.look_up_dict
        for edge in self.g.G.edges():
            self.node_degree[look_up[edge[0]]] += self.g.G[edge[0]][
                edge[1]]["weight"]

        norm = sum(
            [math.pow(self.node_degree[i], power) for i in range(numNodes)])

        self.sampling_table = np.zeros(int(table_size), dtype=np.uint32)

        p = 0
        i = 0
        for j in range(numNodes):
            p += float(math.pow(self.node_degree[j], power)) / norm
            while i < table_size and float(i) / table_size < p:
                self.sampling_table[i] = j
                i += 1

        data_size = self.g.G.number_of_edges()
        self.edge_alias = np.zeros(data_size, dtype=np.int32)
        self.edge_prob = np.zeros(data_size, dtype=np.float32)
        large_block = np.zeros(data_size, dtype=np.int32)
        small_block = np.zeros(data_size, dtype=np.int32)

        total_sum = sum([
            self.g.G[edge[0]][edge[1]]["weight"] for edge in self.g.G.edges()
        ])
        norm_prob = [
            self.g.G[edge[0]][edge[1]]["weight"] * data_size / total_sum
            for edge in self.g.G.edges()
        ]
        num_small_block = 0
        num_large_block = 0
        cur_small_block = 0
        cur_large_block = 0
        for k in range(data_size - 1, -1, -1):
            if norm_prob[k] < 1:
                small_block[num_small_block] = k
                num_small_block += 1
            else:
                large_block[num_large_block] = k
                num_large_block += 1
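        # Walker's alias method: repeatedly pair an under-full ("small") bucket
        # with an over-full ("large") one, so that sampling an edge later needs
        # one uniform draw plus at most one alias redirection
        # (see get_random_node_pairs).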
        while num_small_block and num_large_block:
            num_small_block -= 1
            cur_small_block = small_block[num_small_block]
            num_large_block -= 1
            cur_large_block = large_block[num_large_block]
            self.edge_prob[cur_small_block] = norm_prob[cur_small_block]
            self.edge_alias[cur_small_block] = cur_large_block
            norm_prob[cur_large_block] = norm_prob[
                cur_large_block] + norm_prob[cur_small_block] - 1
            if norm_prob[cur_large_block] < 1:
                small_block[num_small_block] = cur_large_block
                num_small_block += 1
            else:
                large_block[num_large_block] = cur_large_block
                num_large_block += 1

        while num_large_block:
            num_large_block -= 1
            self.edge_prob[large_block[num_large_block]] = 1
        while num_small_block:
            num_small_block -= 1
            self.edge_prob[small_block[num_small_block]] = 1

    def save_embeddings(self, outfile):
        vectors = self.get_vectors()
        for c in vectors.keys():
            if 'node_embeddings' in c or 'content_embeddings' in c:
                # outfile-[node_embeddings/content-embeddings]-[src/obj]
                fout = open('{}.{}'.format(outfile, c), 'w')
                node_num = len(vectors[c].keys())
                fout.write("{} {}\n".format(node_num, self.rep_size))
                for node, vec in vectors[c].items():
                    fout.write("{} {}\n".format(
                        node, ' '.join([str(x) for x in vec])))
                fout.close()
        if self.order == 3:
            fout = open('{}.node_embedding_all'.format(outfile), 'w')
            node_num = len(vectors['node_embeddings_order1'])
            fout.write("{} {}\n".format(node_num, self.rep_size * 2))
            for node, vec in vectors['node_embeddings_order1'].items():
                fout.write("{} {} {}\n".format(
                    node, ' '.join([str(x) for x in vec]), ' '.join([
                        str(x) for x in vectors['node_embeddings_order2'][node]
                    ])))
            fout.close()

    def get_one_embeddings(self, embeddings):
        vectors = dict()
        look_back = self.g.look_back_list
        for i, embedding in enumerate(embeddings):
            vectors[look_back[i]] = embedding
        return vectors

    def get_vectors(self):
        order = self.order
        ret = dict()
        node_embeddings_order1 = self.get_one_embeddings(
            self.embeddings['order1'])
        ret['node_embeddings_order1'] = node_embeddings_order1
        node_embeddings_order2 = self.get_one_embeddings(
            self.embeddings['order2'])
        ret['node_embeddings_order2'] = node_embeddings_order2

        if order == 2 or order == 3:
            content_embeddings = self.get_one_embeddings(
                self.embeddings['content'])
            ret['content_embeddings'] = content_embeddings

        return ret
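The precomputed sigmoid table above trades a little accuracy for speed. A standalone sketch of the same trick, with constants mirroring the class; this is an illustration, not repo code:

import numpy as np

SIGMOID_BOUND = 6
TABLE_SIZE = 1000

# bucket k covers x = 2*BOUND*k/TABLE_SIZE - BOUND, i.e. [-6, 6) in 1000 steps
table = [1. / (1 + np.exp(-(2. * SIGMOID_BOUND * k / TABLE_SIZE - SIGMOID_BOUND)))
         for k in range(TABLE_SIZE)]

def fast_sigmoid(val, eps=1e-7):
    if val >= SIGMOID_BOUND:      # clamp instead of indexing past the table
        return 1 - eps
    if val <= -SIGMOID_BOUND:
        return eps
    return table[int((val + SIGMOID_BOUND) * TABLE_SIZE / SIGMOID_BOUND / 2)]

print(fast_sigmoid(0.0))   # ~0.5
print(fast_sigmoid(2.0))   # ~0.88, close to 1/(1+exp(-2))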
Example #5
class _MNA(object):
    def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return

        self.graph = graph
        self.lookup = dict()
        self.lookup['f'] = self.graph['f'].look_up_dict
        self.lookup['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list
        self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop)

        self.neg_ratio = neg_ratio
        self.batch_size = 1024

        self.clf = svm.SVC(probability=True)

    def __get_pair_features(self, src_nds, target_nds):
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The source and target node lists passed to __get_pair_features differ in length.'
            )
            return
        for i in range(len(src_nds)):
            src_nd, target_nd = src_nds[i], target_nds[i]

            src_neighbor_anchors = set()
            for src_nd_to in self.graph['f'].G[self.look_back['f'][src_nd]]:
                if src_nd_to in self.L['f2g']['train']:
                    src_neighbor_anchors.add(src_nd_to)

            target_neighbor_anchors = set()
            for target_nd_to in self.graph['g'].G[self.look_back['g']
                                                  [target_nd]]:
                if target_nd_to in self.L['g2f']['train']:
                    target_neighbor_anchors.add(target_nd_to)

            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for k in range(len(self.L['f2g']['train'][sna])):
                    target_anchor_nd = self.L['f2g']['train'][sna][k]
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        AA_measure += 1./np.log((len(self.graph['f'].G[sna])\
                                                +len(self.graph['g'].G[self.L['f2g']['train'][sna][k]]))/2.)
            jaccard = cnt_common_neighbors/(len(self.graph['f'].G[self.look_back['f'][src_nd]])\
                                            +len(self.graph['g'].G[self.look_back['g'][target_nd]])\
                                            -cnt_common_neighbors+1e-6)

            yield [cnt_common_neighbors, jaccard, AA_measure]

    def train(self):

        batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio,
                                 self.lookup, 'f', 'g')

        X = list()
        Y = list()
        for batch in batches_f2g:
            pos, neg = batch
            if len(pos['f']) != len(pos['g']) or len(neg['f']) != len(
                    neg['g']):
                self.logger.info(
                    'Skipping batch: the input label file is malformed.')
                continue
            pos_features = list(self.__get_pair_features(pos['f'], pos['g']))
            X.extend(pos_features)
            Y.extend([1 for m in range(len(pos_features))])

            for k in range(self.neg_ratio):
                neg_features = list(
                    self.__get_pair_features(neg['f'][k], neg['g'][k]))
                X.extend(neg_features)
                Y.extend([-1 for m in range(len(neg_features))])

            self.logger.info('Training Model...')
            self.clf.fit(X, Y)
            self.logger.info('Training score: %f' % self.clf.score(X, Y))
            self.logger.info('Complete Training process...')