Example 1
    def __init__(self, graph, lr=.001, rep_size=128, batch_size=100, negative_ratio=5, order=3, table_size=1e8,
                    log_file='log', last_emb_file=None):

        if os.path.exists('log/'+log_file+'.log'):
            os.remove('log/'+log_file+'.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6

        self._init_simgoid_table()

        self.g = graph
        self.look_up = self.g.look_up_dict
        self.idx = defaultdict(int)
        self.update_dict = defaultdict(dict)
        self.update_look_back = defaultdict(list)

        self.node_size = self.g.node_size
        self.rep_size = rep_size
        
        self._init_params(self.node_size, rep_size, last_emb_file)

        self.order = order
        self.lr = lr
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio
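
The constructor above precomputes a sigmoid lookup table via _init_simgoid_table and later reads it through _fast_sigmoid (both defined in full in Example 11). Below is a small self-contained sketch of that trick, with illustrative names and sizes; it is not the project's code.

import numpy as np

SIGMOID_BOUND = 6
SIGMOID_TABLE_SIZE = 1000
EPSILON = 1e-7

# tabulate sigmoid(x) on [-SIGMOID_BOUND, SIGMOID_BOUND] once, up front
sigmoid_table = {
    k: 1. / (1 + np.exp(-(2 * SIGMOID_BOUND * k / SIGMOID_TABLE_SIZE - SIGMOID_BOUND)))
    for k in range(SIGMOID_TABLE_SIZE)
}

def fast_sigmoid(val):
    # clamp values outside the tabulated range to (EPSILON, 1 - EPSILON)
    if val > SIGMOID_BOUND:
        return 1 - EPSILON
    if val < -SIGMOID_BOUND:
        return EPSILON
    k = int((val + SIGMOID_BOUND) * SIGMOID_TABLE_SIZE / SIGMOID_BOUND / 2)
    return sigmoid_table[k]

print(fast_sigmoid(0.0))  # ~0.5, read from the table instead of calling np.exp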
Example 2
    def __init__(self, graph, attr_file, anchorfile, use_net, valid_prop,
                 neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return

        self.use_net = use_net
        self.graph = graph
        self.lookup = dict()
        self.lookup['f'] = self.graph['f'].look_up_dict
        self.lookup['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list
        self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop)

        self.attributes = dict()
        if attr_file:
            self.attributes['f'] = self._set_node_attributes(attr_file[0])
            self.attributes['g'] = self._set_node_attributes(attr_file[1])

        self.neg_ratio = neg_ratio
        self.batch_size = 1024

        self.clf = svm.SVC(probability=True)
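
A minimal, hypothetical sketch of how the svm.SVC(probability=True) classifier created above could be fit on hand-crafted pair features and then queried for match probabilities; the feature vectors and labels below are made up for illustration.

import numpy as np
from sklearn import svm

# each row: [common anchor neighbours, Jaccard score, Adamic-Adar measure]
X = np.array([[2., 0.40, 1.3],
              [0., 0.00, 0.0],
              [3., 0.50, 1.9],
              [1., 0.05, 0.4]])
y = np.array([1, -1, 1, -1])  # 1 = matching pair, -1 = negative sample

clf = svm.SVC(probability=True)
clf.fit(X, y)
print(clf.predict_proba([[2., 0.30, 1.0]]))  # class probabilities for a new pair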
Example 3
    def __init__(self, learning_rate, batch_size, neg_ratio, gamma, eta,
                 n_input, n_out, n_hidden, n_layer, type_model, is_valid,
                 device, files, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.device = device

        self.type_model = type_model

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.neg_ratio = neg_ratio
        self.valid = is_valid
        self.valid_prop = .9 if self.valid else 1.
        self.valid_sample_size = 10

        self.gamma = gamma
        self.eta = eta

        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden if type_model == 'mlp' else n_input  # number of neurons in hidden layer
        self.n_input = n_input  # size of node embeddings
        self.n_out = n_out  # hashing code
        self.n_layer = n_layer  # number of layers

        # Set Train Data
        if not isinstance(files, list) or len(files) < 3:
            self.logger.info(
                'The algorithm needs inputs: feature-src, feature-end, identity-linkage'
            )
            return

        # tf Graph input
        self.lookup = defaultdict(dict)
        self.look_back = defaultdict(list)
        self._read_train_dat(
            files)  # features from source, features from end, label file
        self.valid_sample_size = min(
            min(self.valid_sample_size,
                len(self.look_back['src']) - 1),
            len(self.look_back['end']) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False,
                                                           seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model",
                                   reuse=None,
                                   initializer=initializer):
                self._init_weights()
                self.build_graph(type_model)
                self.build_valid_graph(type_model)
            self.sess.run(tf.global_variables_initializer())
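
The TensorFlow 1.x pattern used above (device placement, a "model" variable scope carrying a Xavier initializer, then one global_variables_initializer run) is sketched below on a toy variable; it assumes the TF 1.x / tf.contrib API and is not the actual model-building code.

import random
import tensorflow as tf  # TensorFlow 1.x assumed

sess = tf.Session()
cur_seed = random.getrandbits(32)
initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed)
with tf.device('/cpu:0'):
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        # variables created here inherit the scope's Xavier initializer
        W = tf.get_variable("W", shape=[8, 4])
        x = tf.placeholder(tf.float32, [None, 8])
        h = tf.nn.tanh(tf.matmul(x, W))
    sess.run(tf.global_variables_initializer())
print(sess.run(h, feed_dict={x: [[0.] * 8]}).shape)  # (1, 4)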
Example 4
    def __init__(self, graph, lr=.001, rep_size=128, batch_size=100, negative_ratio=5\
                    , order=3, table_size=1e8, embedfile=None, anchorfile=None, log_file='log'):

        if not embedfile or not anchorfile:
            return

        if os.path.exists('log/'+log_file+'.log'):
            os.remove('log/'+log_file+'.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6

        self._init_simgoid_table()

        self.g = graph
        self.look_up = self.g.look_up_dict
        self.idx = defaultdict(int)
        self.update_dict = defaultdict(dict)
        self.update_look_back = defaultdict(list)

        self.node_size = graph.G.number_of_nodes()
        self.rep_size = rep_size
        
        self.order = order
        self.lr = lr
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

        self._gen_sampling_table()
        self._init_params(self.node_size, rep_size, embedfile, anchorfile)
Example 5
    def __init__(self, learning_rate, batch_size, neg_ratio, n_input, n_out,
                 n_hidden, n_layer, device, files, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.device = device

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.neg_ratio = neg_ratio
        self.valid_prop = .9
        self.valid_sample_size = 9

        self.gamma = 1
        self.eta = 0

        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden  # number of neurons in hidden layer
        self.n_input = n_input  # size of node embeddings
        self.n_out = n_out  # hashing code
        self.n_layer = n_layer  # number of layers

        # Set Train Data
        if not isinstance(files, list) or len(files) < 3:
            self.logger.info(
                'The algorithm needs files like [First Graph File, Second Graph File, Label File]'
            )
            return

        # tf Graph input
        self.lookup_f = dict()
        self.lookup_g = dict()
        self.look_back_f = list()
        self.look_back_g = list()
        self._read_train_dat(files[0], files[1],
                             files[2])  # douban, weibo, label files
        self.valid_sample_size = min(
            min(self.valid_sample_size,
                len(self.look_back_f) - 1),
            len(self.look_back_g) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False,
                                                           seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model",
                                   reuse=None,
                                   initializer=initializer):
                self.mlp_weights()
                self.build_graph()
                self.build_valid_graph()
            self.sess.run(tf.global_variables_initializer())
Example 6
    def __init__(self,
                 layer_graphs,
                 anchor_file,
                 lr=.001,
                 nd_rep_size=16,
                 layer_rep_size=16,
                 batch_size=100,
                 negative_ratio=5,
                 table_size=1e8,
                 log_file='log',
                 last_emb_file=None):

        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6

        self._init_simgoid_table()

        self.anchors, num_anchors = self._read_anchors(anchor_file, ',')
        self.logger.info('Number of anchors:%d' % num_anchors)

        self.num_layers = len(layer_graphs)  # number of calculated networks
        self.layer_graphs = layer_graphs  # graphs in different layers
        self.nd_rep_size = nd_rep_size  # representation size of node
        self.layer_rep_size = layer_rep_size  # representation size of layer

        self.idx = 0  # for speeding up calculation

        # self.node_size = 0
        # for i in range(self.num_layers):
        #     self.node_size += layer_graphs[i].node_size
        # self.node_size -= num_anchors
        # print(self.node_size)
        # may need to be revised
        self.update_dict = defaultdict(int)
        self.update_look_back = list()
        self._build_dict(layer_graphs, self.anchors)
        self.logger.info('Number of nodes:%d' % len(self.look_back))

        self.node_size = len(self.look_back)

        self._init_params(self.node_size, self.num_layers, nd_rep_size,
                          layer_rep_size, last_emb_file)

        self.lr = lr
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

        self._gen_sampling_table()
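
_read_anchors(anchor_file, ',') above is not shown; the sketch below is a guess at the comma-separated anchor file it reads, one "node_in_layer_a,node_in_layer_b" pair per line, modelled on the two-column whitespace-separated reader that appears in Example 11.

def read_anchors(anchor_file, sep=','):
    # hypothetical reader: returns [(node_a, node_b), ...] and the pair count
    anchors = []
    with open(anchor_file, 'r') as fin:
        for ln in fin:
            elems = ln.strip().split(sep)
            if len(elems) < 2:
                continue
            anchors.append((elems[0], elems[1]))
    return anchors, len(anchors)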
Example 7
    def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return

        self.L = load_train_valid_labels(anchorfile, valid_prop)
        self.graph = graph
        self.look_up = dict()
        self.look_up['f'] = self.graph['f'].look_up_dict
        self.look_up['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list

        self.neg_ratio = neg_ratio
        self.batch_size = 1024

        self.clf = svm.SVC()
Example 8
    def __init__(self, graphs, lr=.001, gamma=.1, rep_size=128, batch_size=100, negative_ratio=5, table_size=1e8,
                    anchor_file=None, log_file='log', last_emb_files=dict()):

        if os.path.exists('log/'+log_file+'.log'):
            os.remove('log/'+log_file+'.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6

        self._init_simgoid_table()

        self._init_dicts()
        self.t = 1
        self.rep_size = rep_size
        for graph_type in ['f', 'g']:
            self.g[graph_type] = graphs[graph_type]
            self.look_up[graph_type] = self.g[graph_type].look_up_dict
            self.idx[graph_type] = 0
            self.update_dict[graph_type] = dict()
            self.update_look_back[graph_type] = list()
            self.node_size[graph_type] = self.g[graph_type].node_size
            self.embeddings[graph_type], self.h_delta[graph_type], self.m[graph_type], self.v[graph_type]\
                    = self._init_params(self.node_size[graph_type], rep_size,
                                            last_emb_files, graph_type)
            self._gen_sampling_table(graph_type)

        self.anchors = self._read_anchors(anchor_file, ',')

        self.lr = lr
        self.gamma = gamma
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio
Example 9
def main(args):
    t1 = time.time()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    logger = LogHandler('RUN.' +
                        time.strftime('%Y-%m-%d', time.localtime(time.time())))
    logger.info(args)

    SAVING_STEP = args.saving_step
    MAX_EPOCHS = args.epochs
    if args.method == 'pale':
        model = PALE(
            learning_rate=args.lr,
            batch_size=args.batch_size,
            n_input=args.input_size,
            n_hidden=args.hidden_size,
            n_layer=args.layers,
            files=[args.embedding1, args.embedding2, args.identity_linkage],
            type_model=args.type_model,
            is_valid=args.is_valid,
            log_file=args.log_file,
            device=args.device)
    if args.method == 'mna' or args.method == 'fruip':
        graph = defaultdict(Graph)
        print("Loading graph...")
        if args.graph_format == 'adjlist':
            if args.graph1:
                graph['f'].read_adjlist(filename=args.graph1)
            if args.graph2:
                graph['g'].read_adjlist(filename=args.graph2)
        if args.graph_format == 'edgelist':
            if args.graph1:
                graph['f'].read_edgelist(filename=args.graph1)
            if args.graph2:
                graph['g'].read_edgelist(filename=args.graph2)

        if args.method == 'mna':
            model = MNA(graph=graph, anchorfile=args.identity_linkage, valid_prop=1.\
                        , neg_ratio=3, log_file=args.log_file)
        if args.method == 'fruip':
            embed_files = [args.embedding1, args.embedding2]
            model = FRUIP(graph=graph,
                          embed_files=embed_files,
                          linkage_file=args.identity_linkage)
            model.main_proc(args.threshold)
    if args.method == 'final':
        main_proc(graph_files=[args.graph1, args.graph2],
                  graph_sizes=[args.graph_size1, args.graph_size2],
                  linkage_file=args.identity_linkage,
                  alpha=args.alpha,
                  epoch=args.epochs,
                  tol=args.tol,
                  graph_format=args.graph_format,
                  test_anchor_file=args.test_anchors,
                  output_file=args.output)

    if args.method in ['pale']:
        losses = np.zeros(MAX_EPOCHS)
        val_scrs = np.zeros(MAX_EPOCHS)
        best_scr = .0
        best_epoch = 0
        thres = 100
        for i in range(1, MAX_EPOCHS + 1):
            losses[i - 1], val_scrs[i - 1] = model.train_one_epoch()
            if i > 0 and i % SAVING_STEP == 0:
                loss_mean = np.mean(losses[i - SAVING_STEP:i])
                scr_mean = np.mean(val_scrs[i - SAVING_STEP:i])
                logger.info(
                    'loss in last {} epochs: {}, validation in last {} epochs: {}'
                    .format(SAVING_STEP, loss_mean, SAVING_STEP, scr_mean))
                if scr_mean > best_scr:
                    best_scr = scr_mean
                    best_epoch = i
                    model.save_models(args.output)
                if args.early_stop and i >= thres * SAVING_STEP:
                    cnt = 0
                    for k in range(thres - 1, -1, -1):
                        cur_val = np.mean(
                            val_scrs[i - (k + 1) * SAVING_STEP:i -
                                     k * SAVING_STEP])
                        if cur_val < best_scr:
                            cnt += 1
                    if cnt == thres and (i -
                                         best_epoch) >= thres * SAVING_STEP:
                        logger.info('*********early stop*********')
                        logger.info(
                            'The best epoch: {}\nThe validation score: {}'.
                            format(best_epoch, best_scr))
                        break
    if args.method in ['mna', 'fruip']:
        model.save_model(args.output)
    t2 = time.time()
    print('time cost:', t2 - t1)
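
A hedged sketch of the argparse wiring that could feed main(args); only the attribute names actually read inside main (args.method, args.gpu_id, args.lr, ...) are taken from the code above, while the flag spellings, defaults and help texts are assumptions.

import argparse

def parse_args():
    # hypothetical flags; dest names mirror the attributes main() reads
    parser = argparse.ArgumentParser(description='network alignment runner')
    parser.add_argument('--method', choices=['pale', 'mna', 'fruip', 'final'], default='pale')
    parser.add_argument('--gpu_id', default='0')
    parser.add_argument('--lr', type=float, default=.001)
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--saving_step', type=int, default=10)
    parser.add_argument('--log_file', default='log')
    parser.add_argument('--output', default='output')
    # remaining flags (embedding1/2, graph1/2, identity_linkage, ...) follow the same pattern
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())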
Example 10
class _MNA(object):
    def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return

        self.graph = graph
        self.lookup = dict()
        self.lookup['f'] = self.graph['f'].look_up_dict
        self.lookup['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list
        self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop)

        self.neg_ratio = neg_ratio
        self.batch_size = 1024

        self.clf = svm.SVC(probability=True)

    def __get_pair_features(self, src_nds, target_nds):
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The numbers of source and target nodes passed to __get_pair_features are not equal.'
            )
            return
        for i in range(len(src_nds)):
            src_nd, target_nd = src_nds[i], target_nds[i]

            src_neighbor_anchors = set()
            for src_nd_to in self.graph['f'].G[self.look_back['f'][src_nd]]:
                if src_nd_to in self.L['f2g']['train']:
                    src_neighbor_anchors.add(src_nd_to)

            target_neighbor_anchors = set()
            for target_nd_to in self.graph['g'].G[self.look_back['g']
                                                  [target_nd]]:
                if target_nd_to in self.L['g2f']['train']:
                    target_neighbor_anchors.add(target_nd_to)

            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for k in range(len(self.L['f2g']['train'][sna])):
                    target_anchor_nd = self.L['f2g']['train'][sna][k]
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        AA_measure += 1./np.log((len(self.graph['f'].G[sna])\
                                                +len(self.graph['g'].G[self.L['f2g']['train'][sna][k]]))/2.)
            jaccard = cnt_common_neighbors/(len(self.graph['f'].G[self.look_back['f'][src_nd]])\
                                            +len(self.graph['g'].G[self.look_back['g'][target_nd]])\
                                            -cnt_common_neighbors+1e-6)

            yield [cnt_common_neighbors, jaccard, AA_measure]

    def train(self):

        batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio,
                                 self.lookup, 'f', 'g')

        X = list()
        Y = list()
        for batch in batches_f2g:
            pos, neg = batch
            if len(pos['f']) != len(pos['g']) or len(neg['f']) != len(neg['g']):
                self.logger.info(
                    'The input label file is malformed (mismatched positive/negative pair counts).')
                continue
            pos_features = list(self.__get_pair_features(pos['f'], pos['g']))
            X.extend(pos_features)
            Y.extend([1 for m in range(len(pos_features))])

            for k in range(self.neg_ratio):
                neg_features = list(
                    self.__get_pair_features(neg['f'][k], neg['g'][k]))
                X.extend(neg_features)
                Y.extend([-1 for m in range(len(neg_features))])

            self.logger.info('Training Model...')
            self.clf.fit(X, Y)
            self.logger.info('Training score: %f' % self.clf.score(X, Y))
            self.logger.info('Complete Training process...')
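
A self-contained toy illustration of the three pair features computed by __get_pair_features above (shared anchor neighbours, a smoothed Jaccard score, and an Adamic-Adar style measure); the two adjacency dicts and the anchor map are invented for the example and index nodes by name rather than by lookup id.

import numpy as np

graph_f = {'a': ['b', 'c'], 'b': ['a', 'c'], 'c': ['a', 'b']}   # source network
graph_g = {'x': ['y', 'z'], 'y': ['x', 'z'], 'z': ['x', 'y']}   # target network
anchors_f2g = {'b': ['y'], 'c': ['z']}                          # known cross-network links

def pair_features(src_nd, target_nd):
    src_anchor_nbrs = {n for n in graph_f[src_nd] if n in anchors_f2g}
    target_nbrs = set(graph_g[target_nd])
    cnt_common, aa = 0., 0.
    for sna in src_anchor_nbrs:
        for counterpart in anchors_f2g[sna]:
            if counterpart in target_nbrs:
                cnt_common += 1.
                aa += 1. / np.log((len(graph_f[sna]) + len(graph_g[counterpart])) / 2.)
    jaccard = cnt_common / (len(graph_f[src_nd]) + len(graph_g[target_nd]) - cnt_common + 1e-6)
    return [cnt_common, jaccard, aa]

print(pair_features('a', 'x'))  # [2.0, ~1.0, ~2.89]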
Example 11
class _LINE_ANCHORREG_ALIGN_PRETRAIN(object):

    def __init__(self, graph, lr=.001, gamma=.1, rep_size=128, batch_size=100, negative_ratio=5\
                    , order=3, table_size=1e8, embedfile=None, anchorfile=None, log_file='log'):

        if not embedfile or not anchorfile:
            return

        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6

        self._init_simgoid_table()

        self.g = graph
        self.look_up = self.g.look_up_dict
        self.idx = defaultdict(int)
        self.update_dict = defaultdict(dict)
        self.update_look_back = defaultdict(list)

        self.node_size = graph.G.number_of_nodes()
        self.rep_size = rep_size

        self.order = order
        self.lr = lr
        self.gamma = gamma
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

        self._gen_sampling_table()
        self._init_params(self.node_size, rep_size, embedfile, anchorfile)

    def _init_params(self, node_size, rep_size, embedfile, anchorfile):
        self.embeddings = dict()
        self.embeddings['order1'] = np.random.normal(0, 1,
                                                     (node_size, rep_size))
        self.embeddings['order2'] = np.random.normal(0, 1,
                                                     (node_size, rep_size))
        self.embeddings['content'] = np.random.normal(0, 1,
                                                      (node_size, rep_size))
        self.embeddings['order1'] = self._set_anchor_nds(
            self.embeddings['order1'], embedfile, anchorfile, 1)
        self.embeddings['order2'] = self._set_anchor_nds(
            self.embeddings['order2'], embedfile, anchorfile, 2)
        self._init_update_params(node_size, rep_size)
        self._pre_train()

    def _init_update_params(self, node_size, rep_size):
        # adagrad
        self.h_delta = dict()
        self.h_delta['order1'] = np.zeros((node_size, rep_size))
        self.h_delta['order2'] = np.zeros((node_size, rep_size))
        self.h_delta['content'] = np.zeros((node_size, rep_size))
        # adam
        self.m = dict()
        self.m['order1'] = np.zeros((node_size, rep_size))
        self.m['order2'] = np.zeros((node_size, rep_size))
        self.m['content'] = np.zeros((node_size, rep_size))
        self.v = dict()
        self.v['order1'] = np.zeros((node_size, rep_size))
        self.v['order2'] = np.zeros((node_size, rep_size))
        self.v['content'] = np.zeros((node_size, rep_size))
        self.t = 1

    def _read_anchors(self, anchorfile):
        anchors = list()
        with open(anchorfile, 'r') as anchor_handler:
            for ln in anchor_handler:
                elems = ln.strip().split()
                anchors.append((elems[0], elems[1]))
        return anchors

    def _read_embeddings(self, embedfile):
        embeddings = dict()
        with open(embedfile, 'r') as embed_handler:
            for ln in embed_handler:
                elems = ln.strip().split()
                if len(elems) <= 2:
                    continue
                embeddings[elems[0]] = [float(x) for x in elems[1:]]  # list, so len()/np.array work later
        return embeddings

    def _set_anchor_nds(self, mat, embedfile, anchorfile, order):
        self.anchors = self._read_anchors(anchorfile)
        self.src_embeddings = self._read_embeddings(embedfile)
        self.anchor_idx = set()
        for src_nd, target_nd in self.anchors:
            if not target_nd in self.look_up or not src_nd in self.src_embeddings:
                continue
            if len(mat[self.look_up[target_nd]]) != len(
                    self.src_embeddings[src_nd]):
                self.logger.error(
                    'The length of embeddings at anchor nodes are illegal')
                break
            self.anchor_idx.add(self.look_up[target_nd])
            # if order==1:
            #     mat[self.look_up[target_nd]] = self.src_embeddings[src_nd][0:len(mat[self.look_up[target_nd]])]
            # if order==2:
            #     mat[self.look_up[target_nd]] = self.src_embeddings[src_nd][len(mat[self.look_up[target_nd]]):]
            mat[self.look_up[target_nd]] = self.src_embeddings[src_nd]
        return mat

    def _pre_train(self):
        self.logger.info("Pretraining...")
        DISPLAY_EPOCH = 1000
        order = self.order
        batches = self.batch_iter()
        opt_type = 'adam'
        for batch in batches:
            self.idx = defaultdict(int)
            self.update_look_back = defaultdict(list)
            self.update_dict = defaultdict(dict)
            if order == 1 or order == 3:
                delta_eh_o1 = self._pretrain_update_graph_by_order1(batch)
                len_delta = len(delta_eh_o1)
                # print 'order1 nd'
                if opt_type == 'adagrad':
                    self.h_delta['order1'], self.embeddings['order1'] = \
                                    self.update_vec('nd_order1', self.h_delta['order1'], delta_eh_o1
                                                    , self.embeddings['order1'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order1'], self.v['order1'], self.embeddings['order1'] = \
                                    self.update_vec_by_adam('nd_order1', self.m['order1'], self.v['order1'], delta_eh_o1
                                                    , self.embeddings['order1'], len_delta, self.t)
            if order == 2 or order == 3:
                delta_c, delta_eh_o2 = self._pretrain_update_graph_by_order2(
                    batch)
                len_delta = len(delta_eh_o2)
                # print 'order2, nd'
                if opt_type == 'adagrad':
                    self.h_delta['order2'], self.embeddings['order2'] = \
                                        self.update_vec('nd_order2', self.h_delta['order2'], delta_eh_o2
                                                        , self.embeddings['order2'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order2'], self.v['order2'], self.embeddings['order2'] = \
                                    self.update_vec_by_adam('nd_order2', self.m['order2'], self.v['order2'], delta_eh_o2
                                                    , self.embeddings['order2'], len_delta, self.t)
                len_content = len(delta_c)
                # print 'order2, content'
                if opt_type == 'adagrad':
                    self.h_delta_c, self.embeddings['content'] = \
                                        self.update_vec('cnt_order2', self.h_delta['content'], delta_c
                                                        , self.embeddings['content'], len_content, self.t)
                if opt_type == 'adam':
                    self.m['content'], self.v['content'], self.embeddings['content'] = \
                                    self.update_vec_by_adam('cnt_order2', self.m['content'], self.v['content'], delta_c
                                                    , self.embeddings['content'], len_content, self.t)
                # self.embeddings_order2[self.update_look_back[:len_de],:] -= self.lr*delta_eh
                # len_content = len(delta_c)
                # self.content_embeddings[self.update_look_back[:len_content],:] -= self.lr*delta_c
                # break
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self._init_update_params(self.node_size, self.rep_size)
        self.logger.info("End of Pretraining")

    def _init_simgoid_table(self):
        for k in range(self.sigmoid_table_size):
            x = 2 * self.SIGMOID_BOUND * k / self.sigmoid_table_size - self.SIGMOID_BOUND
            self.sigmoid_table[k] = 1. / (1 + np.exp(-x))

    def _fast_sigmoid(self, val):
        if val > self.SIGMOID_BOUND:
            return 1 - self.epsilon
        elif val < -self.SIGMOID_BOUND:
            return self.epsilon
        k = int((val + self.SIGMOID_BOUND) * self.sigmoid_table_size /
                self.SIGMOID_BOUND / 2)
        return self.sigmoid_table[k]
        # return 1./(1+np.exp(-val))

    def _pretrain_update_graph_by_order2(self, batch):
        '''
        x = self._binarize(self.embeddings[key])
        '''
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)
        # print pos_h, pos_t, pos_h_v, neg_t

        # order 2
        pos_u = self.embeddings['order2'][pos_h, :]
        pos_v_c = self.embeddings['content'][pos_t, :]
        neg_u = self.embeddings['order2'][pos_h_v, :]
        neg_v_c = self.embeddings['content'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v_c,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # temporal delta
        delta_eh = list()
        delta_c = list()

        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            if not v in self.anchor_idx:
                delta_c = self._calc_delta_vec('cnt_order2', v, delta_c,
                                               (sigmoid_pos_e[i] - 1) *
                                               pos_u[i, :])
            if not u in self.anchor_idx:
                delta_eh = self._calc_delta_vec('nd_order2', u, delta_eh,
                                                (sigmoid_pos_e[i] - 1) *
                                                pos_v_c[i, :])
            # print 'delta_eh',delta_eh,ndDict_order
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                if not v in self.anchor_idx:
                    delta_c = self._calc_delta_vec(
                        'cnt_order2', v, delta_c,
                        sigmoid_neg_e[i, j] * neg_u[i, j, :])
                if not u in self.anchor_idx:
                    delta_eh = self._calc_delta_vec(
                        'nd_order2', u, delta_eh,
                        sigmoid_neg_e[i, j] * neg_v_c[i, j, :])
                # print sigmoid_neg_e[i,j]*neg_v_c[i,j,:], type(sigmoid_neg_e[i,j]*neg_v_c[i,j,:])
                # print 'delta_eh',delta_eh,ndDict_order

        # delta x & delta codebook
        delta_eh = self._format_vec('nd_order2', delta_eh)
        delta_c = self._format_vec('cnt_order2', delta_c)

        return delta_c / batch_size, delta_eh / batch_size

    def _pretrain_update_graph_by_order1(self, batch):
        '''
        x = self._binarize(self.embeddings[key])
        '''
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)

        # order 1
        pos_u = self.embeddings['order1'][pos_h, :]
        pos_v = self.embeddings['order1'][pos_t, :]
        neg_u = self.embeddings['order1'][pos_h_v, :]
        neg_v = self.embeddings['order1'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # delta calculation
        delta_eh = list()

        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            if not v in self.anchor_idx:
                delta_eh = self._calc_delta_vec('nd_order1', v, delta_eh,
                                                (sigmoid_pos_e[i] - 1) *
                                                pos_u[i, :])
            if not u in self.anchor_idx:
                delta_eh = self._calc_delta_vec('nd_order1', u, delta_eh,
                                                (sigmoid_pos_e[i] - 1) *
                                                pos_v[i, :])
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                if not v in self.anchor_idx:
                    delta_eh = self._calc_delta_vec(
                        'nd_order1', v, delta_eh,
                        sigmoid_neg_e[i, j] * neg_u[i, j, :])
                if not u in self.anchor_idx:
                    delta_eh = self._calc_delta_vec(
                        'nd_order1', u, delta_eh,
                        sigmoid_neg_e[i, j] * neg_v[i, j, :])

        # delta x & delta codebook
        delta_eh = self._format_vec('nd_order1', delta_eh)

        return delta_eh / batch_size

    def _cos_sim(self, vec1, vec2):
        return np.dot(vec1, vec2) / np.linalg.norm(vec1) / np.linalg.norm(vec2)

    def _update_graph_by_anchor_reg(self):

        delta_eh = list()

        cnt = 0
        for src_nd, target_nd in self.anchors:
            if not src_nd in self.src_embeddings or not target_nd in self.look_up:
                continue
            src_emb = np.array(self.src_embeddings[src_nd])
            if self.order == 2:
                target_emb = self.embeddings['order2'][self.look_up[target_nd]]
            if self.order == 1:
                target_emb = self.embeddings['order1'][self.look_up[target_nd]]
            delta_eh = self._calc_delta_vec(
                'nd_order2', self.look_up[target_nd], delta_eh,
                (self._cos_sim(src_emb, target_emb) * target_emb /
                 np.dot(target_emb, target_emb) - src_emb /
                 np.linalg.norm(src_emb) / np.linalg.norm(target_emb)) /
                self._cos_sim(src_emb, target_emb))
            cnt += 1

        if self.order == 2:
            delta_eh = self._format_vec('nd_order2', delta_eh)
        if self.order == 1:
            delta_eh = self._format_vec('nd_order1', delta_eh)

        return delta_eh / cnt

    def _format_vec(self, cal_type, vec):
        len_gap = self.idx[cal_type] - len(vec)
        if len_gap > 0:
            for i in range(len_gap):
                if isinstance(vec, list):
                    vec.append(np.zeros(vec[0].shape))
                else:
                    vec = np.vstack((vec, np.zeros(vec[0].shape)))
        return np.array(vec)

    def _calc_delta_vec(self, cal_type, nd, delta, opt_vec):
        if nd not in self.update_dict[cal_type]:
            cur_idx = self.idx[cal_type]
            self.update_dict[cal_type][nd] = cur_idx
            self.update_look_back[cal_type].append(nd)
            self.idx[cal_type] += 1
        else:
            cur_idx = self.update_dict[cal_type][nd]
        if cur_idx >= len(delta):
            for i in range(cur_idx - len(delta)):
                delta.append(np.zeros(opt_vec.shape))
            delta.append(opt_vec)
        else:
            delta[cur_idx] += opt_vec
        return delta

    def _update_graph_by_order2(self, batch):
        '''
        x = self._binarize(self.embeddings[key])
        '''
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)
        # print pos_h, pos_t, pos_h_v, neg_t

        # order 2
        pos_u = self.embeddings['order2'][pos_h, :]
        pos_v_c = self.embeddings['content'][pos_t, :]
        neg_u = self.embeddings['order2'][pos_h_v, :]
        neg_v_c = self.embeddings['content'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v_c,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # temporal delta
        delta_eh = list()
        delta_c = list()

        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            delta_c = self._calc_delta_vec(
                'cnt_order2', v, delta_c, (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            delta_eh = self._calc_delta_vec('nd_order2', u, delta_eh,
                                            (sigmoid_pos_e[i] - 1) *
                                            pos_v_c[i, :])
            # print 'delta_eh',delta_eh,ndDict_order
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                delta_c = self._calc_delta_vec(
                    'cnt_order2', v, delta_c,
                    sigmoid_neg_e[i, j] * neg_u[i, j, :])
                delta_eh = self._calc_delta_vec(
                    'nd_order2', u, delta_eh,
                    sigmoid_neg_e[i, j] * neg_v_c[i, j, :])
                # print sigmoid_neg_e[i,j]*neg_v_c[i,j,:], type(sigmoid_neg_e[i,j]*neg_v_c[i,j,:])
                # print 'delta_eh',delta_eh,ndDict_order

        # delta x & delta codebook
        delta_eh = self._format_vec('nd_order2', delta_eh)
        delta_c = self._format_vec('cnt_order2', delta_c)

        return delta_c / batch_size, delta_eh / batch_size

    def _update_graph_by_order1(self, batch):
        '''
        x = self._binarize(self.embeddings[key])
        '''
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)

        # order 1
        pos_u = self.embeddings['order1'][pos_h, :]
        pos_v = self.embeddings['order1'][pos_t, :]
        neg_u = self.embeddings['order1'][pos_h_v, :]
        neg_v = self.embeddings['order1'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # delta calculation
        delta_eh = list()

        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            delta_eh = self._calc_delta_vec(
                'nd_order1', v, delta_eh, (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            delta_eh = self._calc_delta_vec(
                'nd_order1', u, delta_eh, (sigmoid_pos_e[i] - 1) * pos_v[i, :])
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                delta_eh = self._calc_delta_vec(
                    'nd_order1', v, delta_eh,
                    sigmoid_neg_e[i, j] * neg_u[i, j, :])
                delta_eh = self._calc_delta_vec(
                    'nd_order1', u, delta_eh,
                    sigmoid_neg_e[i, j] * neg_v[i, j, :])

        # delta x & delta codebook
        delta_eh = self._format_vec('nd_order1', delta_eh)

        return delta_eh / batch_size

    def _mat_add(self, mat1, mat2):
        # print '****mat add****'
        # print mat1, mat2
        len_gap = len(mat1) - len(mat2)
        # print len_gap
        if len_gap > 0:
            for i in range(len_gap):
                mat2 = np.vstack((mat2, np.zeros(mat2[0, :].shape)))
                # print mat2
        else:
            for i in range(-len_gap):
                mat1 = np.vstack((mat1, np.zeros(mat1[0, :].shape)))
        #         print mat1
        # print len(mat1), len(mat2)
        return mat1 + mat2

    def get_anchor_reg_loss(self):

        cos_sim_list = list()

        for src_nd, target_nd in self.anchors:
            if not src_nd in self.src_embeddings or not target_nd in self.look_up:
                continue
            src_emb = np.array(self.src_embeddings[src_nd])
            target_emb = self.embeddings['order2'][self.look_up[target_nd]]
            cos_sim_list.append(self._cos_sim(src_emb, target_emb))

        return -np.mean(cos_sim_list)

    def get_graph_loss_by_order2(self, batch):
        pos_h, pos_t, pos_h_v, neg_t = batch

        # order 2
        pos_u = self.embeddings['order2'][pos_h, :]
        pos_v_c = self.embeddings['content'][pos_t, :]
        neg_u = self.embeddings['order2'][pos_h_v, :]
        neg_v_c = self.embeddings['content'][neg_t, :]

        pos_e = np.sum(pos_u * pos_v_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v_c,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        return -np.mean(
            np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_graph_loss_by_order1(self, batch):
        pos_h, pos_t, pos_h_v, neg_t = batch

        # order 1
        pos_u = self.embeddings['order1'][pos_h, :]
        pos_v = self.embeddings['order1'][pos_t, :]
        neg_u = self.embeddings['order1'][pos_h_v, :]
        neg_v = self.embeddings['order1'][neg_t, :]

        # pos_e_1 = np.sum(pos_u*pos_v, axis=1)+np.sum(self.b_e[key][0][pos_t,:], axis=1) # pos_e.shape = batch_size
        # neg_e_1 = np.sum(neg_u*neg_v, axis=2)+np.sum(self.b_e[key][0][neg_t,:], axis=2) # neg_e.shape = batch_size*negative_ratio
        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        return -np.mean(
            np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_cur_batch_loss(self, t, batch):
        DISPLAY_EPOCH = 1
        if t % DISPLAY_EPOCH == 0:
            loss_order_1 = 0.0
            loss_order_2 = 0.0
            if self.order == 1 or self.order == 3:
                loss_order_1 += self.get_graph_loss_by_order1(batch)
            if self.order == 2 or self.order == 3:
                anchor_loss = self.get_anchor_reg_loss()
                loss_order_2 += self.get_graph_loss_by_order2(
                    batch) + anchor_loss
            if self.order == 1:
                self.logger.info(
                    'Finish processing batch {} and loss from order 1:{}'.
                    format(t, loss_order_1))
            elif self.order == 2:
                self.logger.info(
                    'Finish processing batch {} and loss from order 2:{} and anchor loss:{}'
                    .format(t, loss_order_2, anchor_loss))
            elif self.order == 3:
                self.logger.info(
                    'Finish processing batch {} and loss from order 3:{}'.
                    format(t, loss_order_1 + loss_order_2))

    def update_vec(self, cal_type, h_delta, delta, embeddings, len_delta, t):
        h_delta[self.update_look_back[cal_type][:len_delta], :] += delta**2
        # print 'original embedding:',embeddings[self.update_look_back[cal_type][:len_delta]]
        embeddings[self.update_look_back[cal_type][:len_delta],:] -= \
                                self.lr/np.sqrt(h_delta[self.update_look_back[cal_type][:len_delta],:])*delta
        # print 'delta:',delta
        # print 'h_delta:',h_delta[self.update_look_back[cal_type][:len_delta]]
        # print 'embeddings:',embeddings[self.update_look_back[cal_type][:len_delta]]
        # print 'lmd_rda:',elem_lbd
        return h_delta, embeddings

    def update_vec_by_adam(self, cal_type, m, v, delta, embeddings, len_delta,
                           t):
        self.beta1 = .9
        self.beta2 = .999
        m[self.update_look_back[cal_type][:len_delta],:] = \
            self.beta1*m[self.update_look_back[cal_type][:len_delta],:]+(1-self.beta1)*delta
        v[self.update_look_back[cal_type][:len_delta],:] = \
            self.beta2*v[self.update_look_back[cal_type][:len_delta],:]+(1-self.beta2)*(delta**2)
        m_ = m[self.update_look_back[cal_type][:len_delta], :] / (
            1 - self.beta1**t)
        v_ = v[self.update_look_back[cal_type][:len_delta], :] / (
            1 - self.beta2**t)

        embeddings[
            self.update_look_back[cal_type][:len_delta], :] -= self.lr * m_ / (
                np.sqrt(v_) + self.epsilon)

        return m, v, embeddings

    def train_one_epoch(self):
        DISPLAY_EPOCH = 1000
        order = self.order
        batches = self.batch_iter()
        opt_type = 'adam'
        for batch in batches:
            self.idx = defaultdict(int)
            self.update_look_back = defaultdict(list)
            self.update_dict = defaultdict(dict)
            if order == 1 or order == 3:
                delta_eh_o1 = self._update_graph_by_order1(batch)
                len_delta = len(delta_eh_o1)
                # print 'order1 nd'
                if opt_type == 'adagrad':
                    self.h_delta['order1'], self.embeddings['order1'] = \
                                    self.update_vec('nd_order1', self.h_delta['order1'], delta_eh_o1
                                                    , self.embeddings['order1'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order1'], self.v['order1'], self.embeddings['order1'] = \
                                    self.update_vec_by_adam('nd_order1', self.m['order1'], self.v['order1'], delta_eh_o1
                                                    , self.embeddings['order1'], len_delta, self.t)
            if order == 2 or order == 3:
                delta_c, delta_eh_o2 = self._update_graph_by_order2(batch)
                delta_eh_anchor_reg = self._update_graph_by_anchor_reg()
                delta_eh_o2 = self._format_vec('nd_order2', delta_eh_o2)
                len_delta = len(delta_eh_o2)
                # print 'order2, nd'
                if opt_type == 'adagrad':
                    self.h_delta['order2'], self.embeddings['order2'] = \
                                        self.update_vec('nd_order2', self.h_delta['order2']
                                                        , delta_eh_o2+self.gamma*delta_eh_anchor_reg
                                                        , self.embeddings['order2'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order2'], self.v['order2'], self.embeddings['order2'] = \
                                    self.update_vec_by_adam('nd_order2', self.m['order2'], self.v['order2']
                                                    , delta_eh_o2+self.gamma*delta_eh_anchor_reg
                                                    , self.embeddings['order2'], len_delta, self.t)
                len_content = len(delta_c)
                # print 'order2, content'
                if opt_type == 'adagrad':
                    self.h_delta_c, self.embeddings['content'] = \
                                        self.update_vec('cnt_order2', self.h_delta['content'], delta_c
                                                        , self.embeddings['content'], len_content, self.t)
                if opt_type == 'adam':
                    self.m['content'], self.v['content'], self.embeddings['content'] = \
                                    self.update_vec_by_adam('cnt_order2', self.m['content'], self.v['content'], delta_c
                                                    , self.embeddings['content'], len_content, self.t)
                # self.embeddings_order2[self.update_look_back[:len_de],:] -= self.lr*delta_eh
                # len_content = len(delta_c)
                # self.content_embeddings[self.update_look_back[:len_content],:] -= self.lr*delta_c
                # break
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self.cur_epoch += 1

    def get_random_node_pairs(self, i, shuffle_indices, edges, edge_set,
                              numNodes):
        # balance the appearance of edges according to edge_prob
        if not random.random() < self.edge_prob[shuffle_indices[i]]:
            shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]
        cur_h = edges[shuffle_indices[i]][0]
        head = cur_h * numNodes
        cur_t = edges[shuffle_indices[i]][1]
        cur_h_v = []
        cur_neg_t = []
        for j in range(self.negative_ratio):
            rn = self.sampling_table[random.randint(0, int(self.table_size) - 1)]
            while head + rn in edge_set or cur_h == rn or rn in cur_neg_t:
                rn = self.sampling_table[random.randint(
                    0, int(self.table_size) - 1)]
            cur_h_v.append(cur_h)
            cur_neg_t.append(rn)
        return cur_h, cur_t, cur_h_v, cur_neg_t

    def batch_iter(self):

        numNodes = self.node_size

        edges = [(self.look_up[x[0]], self.look_up[x[1]])
                 for x in self.g.G.edges()]
        data_size = self.g.G.number_of_edges()
        edge_set = set([x[0] * numNodes + x[1] for x in edges])
        shuffle_indices = np.random.permutation(np.arange(data_size))

        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while start_index < data_size:
            ret = {}
            pos_h = []
            pos_t = []
            pos_h_v = []
            neg_t = []
            for i in range(start_index, end_index):
                cur_h, cur_t, cur_h_v, cur_neg_t = self.get_random_node_pairs(
                    i, shuffle_indices, edges, edge_set, numNodes)
                pos_h.append(cur_h)
                pos_t.append(cur_t)
                pos_h_v.append(cur_h_v)
                neg_t.append(cur_neg_t)
            ret = (pos_h, pos_t, pos_h_v, neg_t)

            start_index = end_index
            end_index = min(start_index + self.batch_size, data_size)

            yield ret

    def _gen_sampling_table(self):
        table_size = self.table_size
        power = 0.75
        numNodes = self.node_size

        print "Pre-procesing for non-uniform negative sampling!"
        self.node_degree = np.zeros(numNodes)  # out degree

        look_up = self.g.look_up_dict
        for edge in self.g.G.edges():
            self.node_degree[look_up[edge[0]]] += self.g.G[edge[0]][
                edge[1]]["weight"]

        norm = sum(
            [math.pow(self.node_degree[i], power) for i in range(numNodes)])

        self.sampling_table = np.zeros(int(table_size), dtype=np.uint32)

        p = 0
        i = 0
        for j in range(numNodes):
            p += float(math.pow(self.node_degree[j], power)) / norm
            while i < table_size and float(i) / table_size < p:
                self.sampling_table[i] = j
                i += 1

        data_size = self.g.G.number_of_edges()
        self.edge_alias = np.zeros(data_size, dtype=np.int32)
        self.edge_prob = np.zeros(data_size, dtype=np.float32)
        large_block = np.zeros(data_size, dtype=np.int32)
        small_block = np.zeros(data_size, dtype=np.int32)

        total_sum = sum([
            self.g.G[edge[0]][edge[1]]["weight"] for edge in self.g.G.edges()
        ])
        norm_prob = [
            self.g.G[edge[0]][edge[1]]["weight"] * data_size / total_sum
            for edge in self.g.G.edges()
        ]
        num_small_block = 0
        num_large_block = 0
        cur_small_block = 0
        cur_large_block = 0
        for k in range(data_size - 1, -1, -1):
            if norm_prob[k] < 1:
                small_block[num_small_block] = k
                num_small_block += 1
            else:
                large_block[num_large_block] = k
                num_large_block += 1
        while num_small_block and num_large_block:
            num_small_block -= 1
            cur_small_block = small_block[num_small_block]
            num_large_block -= 1
            cur_large_block = large_block[num_large_block]
            self.edge_prob[cur_small_block] = norm_prob[cur_small_block]
            self.edge_alias[cur_small_block] = cur_large_block
            norm_prob[cur_large_block] = norm_prob[
                cur_large_block] + norm_prob[cur_small_block] - 1
            if norm_prob[cur_large_block] < 1:
                small_block[num_small_block] = cur_large_block
                num_small_block += 1
            else:
                large_block[num_large_block] = cur_large_block
                num_large_block += 1

        while num_large_block:
            num_large_block -= 1
            self.edge_prob[large_block[num_large_block]] = 1
        while num_small_block:
            num_small_block -= 1
            self.edge_prob[small_block[num_small_block]] = 1

    def save_embeddings(self, outfile):
        vectors = self.get_vectors()
        for c in vectors.keys():
            if 'node_embeddings' in c or 'content_embeddings' in c:
                # outfile-[node_embeddings/content-embeddings]-[src/obj]
                fout = open('{}.{}'.format(outfile, c), 'w')
                node_num = len(vectors[c].keys())
                fout.write("{} {}\n".format(node_num, self.rep_size))
                for node, vec in vectors[c].items():
                    fout.write("{} {}\n".format(
                        node, ' '.join([str(x) for x in vec])))
                fout.close()
        if self.order == 3:
            fout = open('{}.node_embedding_all'.format(outfile), 'w')
            node_num = len(vectors[c].keys())
            fout.write("{} {}\n".format(node_num, self.rep_size * 2))
            for node, vec in vectors['node_embeddings_order1'].items():
                fout.write("{} {} {}\n".format(
                    node, ' '.join([str(x) for x in vec]), ' '.join([
                        str(x) for x in vectors['node_embeddings_order2'][node]
                    ])))
            fout.close()

    def get_one_embeddings(self, embeddings):
        vectors = dict()
        look_back = self.g.look_back_list
        for i, embedding in enumerate(embeddings):
            vectors[look_back[i]] = embedding
        return vectors

    def get_vectors(self):
        order = self.order
        ret = dict()
        node_embeddings_order1 = self.get_one_embeddings(
            self.embeddings['order1'])
        ret['node_embeddings_order1'] = node_embeddings_order1
        node_embeddings_order2 = self.get_one_embeddings(
            self.embeddings['order2'])
        ret['node_embeddings_order2'] = node_embeddings_order2

        if order == 2 or order == 3:
            content_embeddings = dict()
            content_embeddings = self.get_one_embeddings(
                self.embeddings['content'])
            ret['content_embeddings'] = content_embeddings

        return ret
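
update_vec_by_adam above applies Adam only to the embedding rows touched in the current batch (the first len_delta entries of update_look_back). The self-contained sketch below restates that row-wise update on toy arrays; sizes and values are illustrative.

import numpy as np

lr, beta1, beta2, epsilon = .001, .9, .999, 1e-7
embeddings = np.random.normal(0, 1, (5, 4))   # 5 nodes, 4-dimensional embeddings
m = np.zeros_like(embeddings)                 # first-moment estimate
v = np.zeros_like(embeddings)                 # second-moment estimate

rows = np.array([0, 3])                       # rows updated by this batch
delta = np.full((2, 4), .5)                   # gradient for those rows
t = 1                                         # global update counter

m[rows, :] = beta1 * m[rows, :] + (1 - beta1) * delta
v[rows, :] = beta2 * v[rows, :] + (1 - beta2) * delta ** 2
m_hat = m[rows, :] / (1 - beta1 ** t)         # bias correction
v_hat = v[rows, :] / (1 - beta2 ** t)
embeddings[rows, :] -= lr * m_hat / (np.sqrt(v_hat) + epsilon)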
Example 12
class _FFVM(object):
    def __init__(self,
                 graph,
                 lr=.001,
                 rep_size=128,
                 batch_size=100,
                 negative_ratio=5,
                 order=3,
                 table_size=1e8,
                 log_file='log',
                 last_emb_file=None):

        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6

        self._init_simgoid_table()

        self.g = graph
        self.look_up = self.g.look_up_dict
        self.idx = defaultdict(int)
        self.update_dict = defaultdict(dict)
        self.update_look_back = defaultdict(list)

        self.node_size = self.g.node_size
        self.rep_size = rep_size

        self._init_params(self.node_size, rep_size, last_emb_file)

        self.order = order
        self.lr = lr
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

    def _init_params(self, node_size, rep_size, last_emb_file):
        self.embeddings = dict()
        self.embeddings['node'] = np.random.normal(0, 1, (node_size, rep_size))
        self.embeddings['content'] = np.random.normal(0, 1,
                                                      (node_size, rep_size))
        if last_emb_file:
            self.embeddings['node'] = self._init_emb_matrix(self.embeddings['node']\
                        , '{}.node_embeddings'.format(last_emb_file))
            self.embeddings['content'] = self._init_emb_matrix(self.embeddings['content']\
                        , '{}.content_embeddings'.format(last_emb_file))
        self.embeddings['node'] = np.vstack(
            (self.embeddings['node'], np.zeros(rep_size)))  # for "-1" nodes
        # adagrad
        self.h_delta = dict()
        self.h_delta['node'] = np.zeros((node_size, rep_size))
        self.h_delta['content'] = np.zeros((node_size, rep_size))
        # adam
        self.m = dict()
        self.m['node'] = np.zeros((node_size, rep_size))
        self.m['content'] = np.zeros((node_size, rep_size))
        self.v = dict()
        self.v['node'] = np.zeros((node_size, rep_size))
        self.v['content'] = np.zeros((node_size, rep_size))
        self.t = 1

    def _init_emb_matrix(self, emb, emb_file):
        with open(emb_file, 'r') as embed_handler:
            for ln in embed_handler:
                elems = ln.strip().split()
                if len(elems) <= 2:
                    continue
                emb[self.look_up[elems[0]]] = list(map(float, elems[1:]))
        return emb

    def _init_simgoid_table(self):
        for k in range(self.sigmoid_table_size):
            x = 2 * self.SIGMOID_BOUND * k / self.sigmoid_table_size - self.SIGMOID_BOUND
            self.sigmoid_table[k] = 1. / (1 + np.exp(-x))

    def _fast_sigmoid(self, val):
        if val > self.SIGMOID_BOUND:
            return 1 - self.epsilon
        elif val < -self.SIGMOID_BOUND:
            return self.epsilon
        k = int((val + self.SIGMOID_BOUND) * self.sigmoid_table_size /
                self.SIGMOID_BOUND / 2)
        return self.sigmoid_table[k]
        # return 1./(1+np.exp(-val))

    def _format_vec(self, cal_type, vec):
        len_gap = len(vec) - self.idx[cal_type]
        if len_gap > 0:
            for i in range(len_gap):
                vec.append(np.zeros(vec[0].shape))
        return np.array(vec)

    def _calc_delta_vec(self, cal_type, nd, delta, opt_vec):
        if nd not in self.update_dict[cal_type]:
            cur_idx = self.idx[cal_type]
            self.update_dict[cal_type][nd] = cur_idx
            self.update_look_back[cal_type].append(nd)
            self.idx[cal_type] += 1
        else:
            cur_idx = self.update_dict[cal_type][nd]
        if cur_idx >= len(delta):
            for i in range(cur_idx - len(delta)):
                delta.append(np.zeros(opt_vec.shape))
            delta.append(opt_vec)
        else:
            delta[cur_idx] += opt_vec
        return delta

    def _update_graph(self, batch):
        '''Compute the batch-averaged gradients of the content and node
        embeddings for one batch of node/neighbour pairs and their tiled
        negatives.'''
        sp_nds, sp_neighbors = batch
        batch_size = len(sp_nds)

        # order 1
        pos_q = self.embeddings['content'][sp_nds, :]
        pos_c = np.sum(self.embeddings['node'][sp_neighbors, :], axis=1)
        neg_q = self.embeddings['content'][sp_neighbors, :]

        neg_c = list()
        for c in pos_c:
            neg_c.append(np.tile(c, (self.negative_ratio, 1)))
        neg_c = np.array(neg_c)

        pos_e = np.sum(pos_q * pos_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_q * neg_c,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # delta calculation
        delta_q = list()
        delta_f = list()

        idx = 0
        for i in range(len(sp_nds)):
            u, neighbors = sp_nds[i], sp_neighbors[i]
            delta_q = self._calc_delta_vec(
                'content', u, delta_q, (sigmoid_pos_e[i] - 1) * pos_c[i, :])
            for v in neighbors:
                if v != -1:
                    delta_f = self._calc_delta_vec('node', v, delta_f,
                                                   (sigmoid_pos_e[i] - 1) *
                                                   pos_q[i, :])

        for i in range(len(sp_neighbors)):
            neighbors = sp_neighbors[i]
            for j in range(len(neighbors)):
                u = sp_neighbors[i][j]
                if u != -1:
                    delta_q = self._calc_delta_vec(
                        'content', u, delta_q,
                        sigmoid_neg_e[i, j] * neg_c[i, j, :])
                for v in neighbors:
                    delta_f = self._calc_delta_vec(
                        'node', v, delta_f,
                        sigmoid_neg_e[i, j] * neg_q[i, j, :])

        delta_q = self._format_vec('content', delta_q)
        delta_f = self._format_vec('node', delta_f)

        return delta_q / batch_size, delta_f / batch_size

    def get_graph_loss(self, batch):
        sp_nds, sp_neighbors = batch
        batch_size = len(sp_nds)

        # order 1
        pos_q = self.embeddings['content'][sp_nds, :]
        pos_c = np.sum(self.embeddings['node'][sp_neighbors, :], axis=1)
        neg_q = self.embeddings['content'][sp_neighbors, :]

        neg_c = list()
        for c in pos_c:
            neg_c.append(np.tile(c, (self.negative_ratio, 1)))
        neg_c = np.array(neg_c)

        pos_e = np.sum(pos_q * pos_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_q * neg_c,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        return -np.mean(
            np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_cur_batch_loss(self, t, batch):
        loss = self.get_graph_loss(batch)
        self.logger.info('Finish processing batch {} and loss:{}'.format(
            t, loss))

    def update_vec(self, cal_type, h_delta, delta, embeddings, len_delta, t):
        h_delta[self.update_look_back[cal_type][:len_delta], :] += delta**2
        # print 'original embedding:',embeddings[self.update_look_back[cal_type][:len_delta]]
        embeddings[self.update_look_back[cal_type][:len_delta],:] -= \
                                self.lr/np.sqrt(h_delta[self.update_look_back[cal_type][:len_delta],:])*delta
        # print 'delta:',delta
        # print 'h_delta:',h_delta[self.update_look_back[cal_type][:len_delta]]
        # print 'embeddings:',embeddings[self.update_look_back[cal_type][:len_delta]]
        # print 'lmd_rda:',elem_lbd
        return h_delta, embeddings

    def update_vec_by_adam(self, cal_type, m, v, delta, embeddings, len_delta,
                           t):
        self.beta1 = .9
        self.beta2 = .999
        m[self.update_look_back[cal_type][:len_delta],:] = \
            self.beta1*m[self.update_look_back[cal_type][:len_delta],:]+(1-self.beta1)*delta
        v[self.update_look_back[cal_type][:len_delta],:] = \
            self.beta2*v[self.update_look_back[cal_type][:len_delta],:]+(1-self.beta2)*(delta**2)
        m_ = m[self.update_look_back[cal_type][:len_delta], :] / (
            1 - self.beta1**t)
        v_ = v[self.update_look_back[cal_type][:len_delta], :] / (
            1 - self.beta2**t)

        embeddings[
            self.update_look_back[cal_type][:len_delta], :] -= self.lr * m_ / (
                np.sqrt(v_) + self.epsilon)

        return m, v, embeddings

    def train_one_epoch(self):
        DISPLAY_EPOCH = 100
        batches = self.batch_iter()
        # opt_type = 'adagrad'
        opt_type = 'adam'
        for batch in batches:
            self.idx = defaultdict(int)
            self.update_look_back = defaultdict(list)
            self.update_dict = defaultdict(dict)
            delta_q, delta_f = self._update_graph(batch)
            len_delta_f = len(delta_f)
            # print 'order2, nd'
            if opt_type == 'adagrad':
                self.h_delta['node'], self.embeddings['node'] = \
                                    self.update_vec('node', self.h_delta['node'], delta_f
                                                    , self.embeddings['node'], len_delta_f, self.t)
            if opt_type == 'adam':
                self.m['node'], self.v['node'], self.embeddings['node'] = \
                                self.update_vec_by_adam('node', self.m['node'], self.v['node'], delta_f
                                                , self.embeddings['node'], len_delta_f, self.t)
            len_delta_q = len(delta_q)
            # print 'order2, content'
            if opt_type == 'adagrad':
                self.h_delta['content'], self.embeddings['content'] = \
                                    self.update_vec('content', self.h_delta['content'], delta_q
                                                    , self.embeddings['content'], len_delta_q, self.t)
            if opt_type == 'adam':
                self.m['content'], self.v['content'], self.embeddings['content'] = \
                                self.update_vec_by_adam('content', self.m['content'], self.v['content'], delta_q
                                                , self.embeddings['content'], len_delta_q, self.t)
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self.cur_epoch += 1

    def get_random_neighbor_nodes(self, nd_idx):
        graph = self.g.G
        look_up = self.g.look_up_dict
        look_back = self.g.look_back_list

        nd = self.g.look_back_list[nd_idx]
        neigh_nds = np.array([self.look_up[vid] for vid in graph[nd].keys()])
        shuffle_idx = np.random.permutation(np.arange(len(neigh_nds)))

        end_idx = self.negative_ratio if len(
            neigh_nds) > self.negative_ratio else len(neigh_nds)

        return neigh_nds[shuffle_idx[:end_idx]]

    def batch_iter(self):

        numNodes = self.node_size

        data_size = numNodes
        shuffle_indices = np.random.permutation(np.arange(data_size))

        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while start_index < data_size:
            ret = {}
            sp_nds = shuffle_indices[start_index:end_index]
            sp_neighbors = []
            for idx in sp_nds:
                neighbors = self.get_random_neighbor_nodes(idx)
                if len(neighbors) < self.negative_ratio:
                    neighbors = np.hstack(
                        (neighbors, -np.ones(self.negative_ratio -
                                             len(neighbors)))).astype(int)
                sp_neighbors.append(neighbors)
            ret = sp_nds, sp_neighbors

            start_index = end_index
            end_index = min(start_index + self.batch_size, data_size)

            yield ret

    def get_one_embeddings(self, embeddings):
        vectors = dict()
        look_back = self.g.look_back_list
        for i, embedding in enumerate(embeddings):
            if i == len(embeddings) - 1:
                continue
            vectors[look_back[i]] = embedding
        return vectors

    def get_vectors(self):
        order = self.order
        ret = dict()
        node_embeddings = self.get_one_embeddings(self.embeddings['node'])
        ret['node'] = node_embeddings
        content_embeddings = self.get_one_embeddings(
            self.embeddings['content'])
        ret['content'] = content_embeddings

        return ret
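A minimal standalone sketch of the sparse Adam step that update_vec_by_adam above applies to the embedding rows touched in a batch; the toy table, gradient and hyper-parameters are illustrative only.

import numpy as np


def sparse_adam_step(m, v, grad, params, rows, lr=1e-3, t=1,
                     beta1=.9, beta2=.999, eps=1e-7):
    """One Adam update restricted to the rows that received gradients."""
    m[rows] = beta1 * m[rows] + (1 - beta1) * grad
    v[rows] = beta2 * v[rows] + (1 - beta2) * grad ** 2
    m_hat = m[rows] / (1 - beta1 ** t)   # bias-corrected first moment
    v_hat = v[rows] / (1 - beta2 ** t)   # bias-corrected second moment
    params[rows] -= lr * m_hat / (np.sqrt(v_hat) + eps)
    return m, v, params


rng = np.random.RandomState(0)
params = rng.normal(0, 1, (5, 4))        # toy embedding table
m, v = np.zeros_like(params), np.zeros_like(params)
rows = np.array([1, 3])                  # only these rows were updated
grad = rng.normal(0, 1, (2, 4))          # their accumulated gradients
m, v, params = sparse_adam_step(m, v, grad, params, rows)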
Example no. 13
0
class HALF_DP(object):
    def __init__(self, learning_rate, batch_size, neg_ratio, gamma, eta,
                 n_input, n_out, n_hidden, n_layer, type_model, is_valid,
                 device, files, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.device = device

        self.type_model = type_model

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.neg_ratio = neg_ratio
        self.valid = is_valid
        self.valid_prop = .9 if self.valid else 1.
        self.valid_sample_size = 10

        self.gamma = gamma
        self.eta = eta

        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden if type_model == 'mlp' else n_input  # number of neurons in hidden layer
        self.n_input = n_input  # size of node embeddings
        self.n_out = n_out  # hashing code
        self.n_layer = n_layer  # number of layer

        # Set Train Data
        if not isinstance(files, dict) or len(files) < 3:
            self.logger.info(
                'The algorithm needs inputs: feature-src, feature-end, identity-linkage'
            )
            return

        # tf Graph input
        self.lookup = defaultdict(dict)
        self.look_back = defaultdict(list)
        self._read_train_dat(
            files)  # features from source, features from end, label file
        self.valid_sample_size = min(
            min(self.valid_sample_size,
                len(self.look_back['src']) - 1),
            len(self.look_back['end']) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False,
                                                           seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model",
                                   reuse=None,
                                   initializer=initializer):
                self._init_weights()
                self.build_graph(type_model)
                self.build_valid_graph(type_model)
            self.sess.run(tf.global_variables_initializer())

    def _read_train_dat(self, files):
        self.F, self.lookup['src'], self.look_back['src'] = read_features(
            files['feat-src'])
        self.G, self.lookup['end'], self.look_back['end'] = read_features(
            files['feat-end'])
        self.L = load_train_valid_labels(files['linkage'], self.lookup,
                                         self.valid_prop)

    def _init_weights(self):
        # Store layers weight & bias
        self.weights = dict()
        self.biases = dict()
        if self.type_model == 'mlp':
            self.weights['h0_src'] = tf.Variable(
                tf.random_normal([self.n_input, self.n_hidden]))
            self.weights['h0_end'] = tf.Variable(
                tf.random_normal([self.n_input, self.n_hidden]))
            self.biases['b0_src'] = tf.Variable(tf.zeros([self.n_hidden]))
            self.biases['b0_end'] = tf.Variable(tf.zeros([self.n_hidden]))
            for i in range(1, self.n_layer):
                self.weights['h{}_src'.format(i)] = tf.Variable(
                    tf.random_normal([self.n_hidden, self.n_hidden]))
                self.weights['h{}_end'.format(i)] = tf.Variable(
                    tf.random_normal([self.n_hidden, self.n_hidden]))
                self.biases['b{}_src'.format(i)] = tf.Variable(
                    tf.zeros([self.n_hidden]))
                self.biases['b{}_end'.format(i)] = tf.Variable(
                    tf.zeros([self.n_hidden]))
        self.weights['out_src'] = tf.Variable(
            tf.random_normal([self.n_hidden, self.n_out]))
        self.weights['out_end'] = tf.Variable(
            tf.random_normal([self.n_hidden, self.n_out]))
        self.biases['b_out_src'] = tf.Variable(tf.zeros([self.n_out]))
        self.biases['b_out_end'] = tf.Variable(tf.zeros([self.n_out]))

    def build_lin_code_graph(self, inputs, tag):

        # Output fully connected layer with a neuron
        code = tf.nn.tanh(
            tf.matmul(tf.reshape(inputs, [-1, self.n_input]), self.weights[
                'out_' + tag]) + self.biases['b_out_' + tag])

        return code

    def build_mlp_code_graph(self, inputs, tag):

        # Input layer
        layer = tf.nn.sigmoid(
            tf.add(
                tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                          self.weights['h0_' + tag]),
                self.biases['b0_' + tag]))
        for i in range(1, self.n_layer):
            layer = tf.nn.sigmoid(
                tf.add(tf.matmul(layer, self.weights['h{}_{}'.format(i, tag)]),
                       self.biases['b{}_{}'.format(i, tag)]))
        # Output fully connected layer with a neuron
        code = tf.nn.tanh(
            tf.matmul(layer, self.weights['out_' + tag]) +
            self.biases['b_out_' + tag])

        return code

    def build_train_graph(self, src_tag, end_tag, code_graph):

        PF = code_graph(self.inputs_pos[src_tag], src_tag)  # batch_size*n_out
        PG = code_graph(self.inputs_pos[end_tag], end_tag)  # batch_size*n_out
        NF = tf.reshape(
            code_graph(self.inputs_neg[src_tag], src_tag),
            [-1, self.neg_ratio, self.n_out])  # batch_size*neg_ratio*n_out
        NG = tf.reshape(
            code_graph(self.inputs_neg[end_tag], end_tag),
            [-1, self.neg_ratio, self.n_out])  # batch_size*neg_ratio*n_out
        B = tf.sign(PF + PG)  # batch_size*n_out
        # self.ph['B'] = tf.sign(self.ph['F']+self.ph['G']) # batch_size*n_out

        # train loss
        term1_first = tf.log(
            tf.nn.sigmoid(tf.reduce_sum(tf.multiply(PF, PG), axis=1)))
        term1_second = tf.reduce_sum(
            tf.log(1 -
                   tf.nn.sigmoid(tf.reduce_sum(tf.multiply(NF, NG), axis=2))),
            axis=1)
        term1 = -tf.reduce_sum(term1_first + term1_second)
        term2 = tf.reduce_sum(tf.pow(
            (B - PF), 2)) + tf.reduce_sum(tf.pow((B - PG), 2))
        term3 = tf.reduce_sum(
            tf.pow(PF, 2) +
            tf.reduce_sum(tf.pow(NF, 2), axis=1)) + tf.reduce_sum(
                tf.pow(PG, 2) + tf.reduce_sum(tf.pow(NG, 2), axis=1))
        # term3 = tf.reduce_sum(tf.pow(tf.reduce_sum(PF,axis=1),2)+tf.reduce_sum(tf.pow(tf.reduce_sum(NF,axis=2),2),axis=1))\
        #         + tf.reduce_sum(tf.pow(tf.reduce_sum(PG,axis=1),2)+tf.reduce_sum(tf.pow(tf.reduce_sum(NG,axis=2),2),axis=1))
        # self.term1 = term1
        # self.term2 = term2
        # self.term3 = term3

        return (term1 + self.gamma * term2 +
                self.eta * term3) / self.cur_batch_size

    def build_graph(self, type_code_graph):
        self.cur_batch_size = tf.placeholder('float32', name='batch_size')

        self.inputs_pos = {
            'src': tf.placeholder('float32', [None, self.n_input]),
            'end': tf.placeholder('float32', [None, self.n_input])
        }
        self.inputs_neg = {
            'src': tf.placeholder('float32',
                                  [None, self.neg_ratio, self.n_input]),
            'end': tf.placeholder('float32',
                                  [None, self.neg_ratio, self.n_input])
        }

        if type_code_graph == 'lin':
            code_graph = self.build_lin_code_graph
        elif type_code_graph == 'mlp':
            code_graph = self.build_mlp_code_graph

        self.loss = (self.build_train_graph('src', 'end', code_graph) +
                     self.build_train_graph('end', 'src', code_graph)) / 2.

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

    def build_valid_graph(self, type_code_graph):

        # validation
        self.inputs_val = {
            'src':
            tf.placeholder('float32',
                           [None, self.valid_sample_size, self.n_input]),
            'end':
            tf.placeholder('float32',
                           [None, self.valid_sample_size, self.n_input])
        }

        if type_code_graph == 'lin':
            code_graph = self.build_lin_code_graph
        elif type_code_graph == 'mlp':
            code_graph = self.build_mlp_code_graph

        valids = {
            'src':
            tf.reshape(code_graph(self.inputs_val['src'], 'src'),
                       [-1, self.valid_sample_size, self.n_out
                        ]),  # batch_size*neg_ratio*n_out
            'end':
            tf.reshape(code_graph(self.inputs_val['end'], 'end'),
                       [-1, self.valid_sample_size, self.n_out
                        ])  # batch_size*neg_ratio*n_out 
        }

        # self.dot_dist = tf.reduce_sum(tf.multiply(valid_f, valid_g),axis=2)
        self.hamming_dist = -tf.reduce_sum(tf.clip_by_value(
            tf.sign(tf.multiply(valids['src'], valids['end'])), -1., 0.),
                                           axis=2)

    def train_one_epoch(self):
        sum_loss = 0.0
        mrr = 0.0
        valid_size = 0

        # train process
        # print 'start training...'
        batches = batch_iter(self.L, self.batch_size, self.neg_ratio\
                                        , self.lookup, 'src', 'end')

        batch_id = 0
        for batch in batches:
            # training the process from source network to end network
            pos, neg = batch
            if len(pos['src']) != len(pos['end']) or len(
                    neg['src']) != len(neg['end']):
                self.logger.info(
                    'The input label file is malformed.')
                continue
            batch_size = len(pos['src'])
            feed_dict = {
                self.inputs_pos['src']: self.F[pos['src'], :],
                self.inputs_pos['end']: self.G[pos['end'], :],
                self.inputs_neg['src']: self.F[neg['src'], :],
                self.inputs_neg['end']: self.G[neg['end'], :],
                self.cur_batch_size: batch_size
            }
            _, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict)

            sum_loss += cur_loss
            batch_id += 1

        if self.valid:
            # valid process
            valid = valid_iter(self.L, self.valid_sample_size, self.lookup,
                               'src', 'end')
            # print valid_f,valid_g
            if len(valid['src']) != len(valid['end']):
                self.logger.info(
                    'The input label file is malformed.')
                return
            valid_size = len(valid['src'])
            feed_dict = {
                self.inputs_val['src']: self.F[valid['src'], :],
                self.inputs_val['end']: self.G[valid['end'], :],
            }
            # valid_dist = self.sess.run(self.dot_dist,feed_dict)
            valid_dist = self.sess.run(self.hamming_dist, feed_dict)
            mrr = .0
            for i in range(valid_size):
                fst_dist = valid_dist[i][0]
                pos = 1
                for k in range(1, len(valid_dist[i])):
                    if fst_dist >= valid_dist[i][k]:
                        pos += 1
                # print pos
                # self.logger.info('dist:{},pos:{}'.format(fst_dist,pos))
                # print valid_dist[i]
                mrr += 1. / pos
            self.logger.info('Epoch={}, sum of loss={!s}, mrr={}'.format(
                self.cur_epoch, sum_loss / batch_id / 2, mrr / valid_size))
        else:
            self.logger.info('Epoch={}, sum of loss={!s}'.format(
                self.cur_epoch, sum_loss / batch_id / 2))

        self.cur_epoch += 1

        # print(sum_loss/(batch_id+1e-8), mrr/(valid_size+1e-8))
        return sum_loss / (batch_id + 1e-8), mrr / (valid_size + 1e-8)

    def save_models(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
        for k, v in self.weights.items():
            if self.type_model == 'lin':
                if 'out' not in k:
                    continue
            write_in_file(filename, v.eval(self.sess), k)
        for k, v in self.biases.items():
            if self.type_model == 'lin':
                if 'out' not in k:
                    continue
            write_in_file(filename, v.eval(self.sess), k)
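A standalone sketch of the MRR computed in train_one_epoch above: for each validation anchor, column 0 of the distance matrix is the true counterpart, and every other candidate that is at least as close pushes its rank down. The toy distances are made up.

import numpy as np


def mean_reciprocal_rank(dist):
    """dist: (num_anchors, num_candidates); column 0 holds the true match."""
    mrr = 0.
    for row in dist:
        rank = 1 + np.sum(row[1:] <= row[0])   # candidates at least as close
        mrr += 1. / rank
    return mrr / len(dist)


toy_dist = np.array([[0., 2., 3.],    # true match ranked first -> 1/1
                     [2., 1., 3.]])   # one candidate closer    -> 1/2
print(mean_reciprocal_rank(toy_dist))  # 0.75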
Example no. 14
0
class _CROSSMNA(object):
    def __init__(self,
                 layer_graphs,
                 anchor_file,
                 lr=.001,
                 nd_rep_size=16,
                 layer_rep_size=16,
                 batch_size=100,
                 negative_ratio=5,
                 table_size=1e8,
                 log_file='log',
                 last_emb_file=None):

        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6

        self._init_simgoid_table()

        self.anchors, num_anchors = self._read_anchors(anchor_file, ',')
        self.logger.info('Number of anchors:%d' % num_anchors)

        self.num_layers = len(layer_graphs)  # number of calculated networks
        self.layer_graphs = layer_graphs  # graphs in different layers
        self.nd_rep_size = nd_rep_size  # representation size of node
        self.layer_rep_size = layer_rep_size  # representation size of layer

        self.idx = 0  # for speeding up calculation

        # self.node_size = 0
        # for i in range(self.num_layers):
        #     self.node_size += layer_graphs[i].node_size
        # self.node_size -= num_anchors
        # print(self.node_size)
        # may need to be revised
        self.update_dict = defaultdict(int)
        self.update_look_back = list()
        self._build_dict(layer_graphs, self.anchors)
        self.logger.info('Number of nodes:%d' % len(self.look_back))

        self.node_size = len(self.look_back)

        self._init_params(self.node_size, self.num_layers, nd_rep_size,
                          layer_rep_size, last_emb_file)

        self.lr = lr
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

        self._gen_sampling_table()

    def _build_dict(self, layer_graphs, anchors):
        self.look_up = defaultdict(int)
        self.look_back = list()
        idx = 0
        for i in range(self.num_layers):
            for nd in layer_graphs[i].G.nodes():
                if nd in self.look_up:
                    continue
                if nd in self.anchors:
                    for ac_nd in self.anchors[nd]:
                        self.look_up[ac_nd] = idx
                self.look_up[nd] = idx
                self.look_back.append(nd)
                idx += 1

    def _init_params(self, node_size, n_layers, nd_rep_size, layer_rep_size,
                     last_emb_file):
        self.params = dict()
        self.params['node'] = np.random.normal(0, 1, (node_size, nd_rep_size))
        self.params['layer'] = np.random.normal(0, 1,
                                                (n_layers, layer_rep_size))
        self.params['W'] = np.random.normal(0, 1,
                                            (nd_rep_size, layer_rep_size))
        if last_emb_file:
            self.params['node'] = self._init_emb_matrix(self.params['node']\
                        , '{}.node'.format(last_emb_file))
            self.params['layer'] = self._init_emb_matrix(self.params['layer']\
                        , '{}.layer'.format(last_emb_file))
            self.params['W'] = self._init_emb_matrix(
                self.params['W'], '{}.W'.format(last_emb_file))
        # adagrad
        self.h_delta = dict()
        self.h_delta['node'] = np.zeros((node_size, nd_rep_size))
        self.h_delta['layer'] = np.zeros((n_layers, layer_rep_size))
        self.h_delta['W'] = np.zeros((nd_rep_size, layer_rep_size))
        # adam
        self.m = dict()
        self.m['node'] = np.zeros((node_size, nd_rep_size))
        self.m['layer'] = np.zeros((n_layers, layer_rep_size))
        self.m['W'] = np.zeros((nd_rep_size, layer_rep_size))
        self.v = dict()
        self.v['node'] = np.zeros((node_size, nd_rep_size))
        self.v['layer'] = np.zeros((n_layers, layer_rep_size))
        self.v['W'] = np.zeros((nd_rep_size, layer_rep_size))
        self.t = 1

    def _init_emb_matrix(self, emb, emb_file):
        with open(emb_file, 'r') as embed_handler:
            for ln in embed_handler:
                elems = ln.strip().split()
                if len(elems) <= 2:
                    continue
                emb[self.look_up[elems[0]]] = list(map(float, elems[1:]))
        return emb

    def _read_anchors(self, anchor_file, delimiter):
        anchors = dict()
        num_anchors = 0
        with open(anchor_file, 'r') as anchor_handler:
            for ln in anchor_handler:
                elems = ln.strip().split(delimiter)
                for i in range(len(elems)):
                    elems[i] = '{}-{}'.format(i, elems[i])
                num_anchors += len(elems) - 1
                for k in range(len(elems)):
                    anchors[elems[k]] = elems[:k] + elems[k + 1:]
        return anchors, num_anchors

    def _init_simgoid_table(self):
        for k in range(self.sigmoid_table_size):
            x = 2 * self.SIGMOID_BOUND * k / self.sigmoid_table_size - self.SIGMOID_BOUND
            self.sigmoid_table[k] = 1. / (1 + np.exp(-x))

    def _fast_sigmoid(self, val):
        if val > self.SIGMOID_BOUND:
            return 1 - self.epsilon
        elif val < -self.SIGMOID_BOUND:
            return self.epsilon
        k = int((val + self.SIGMOID_BOUND) * self.sigmoid_table_size /
                self.SIGMOID_BOUND / 2)
        return self.sigmoid_table[k]
        # return 1./(1+np.exp(-val))

    def _calc_delta_vec(self, nd, delta, opt_vec):
        if nd not in self.update_dict:
            cur_idx = self.idx
            self.update_dict[nd] = cur_idx
            self.update_look_back.append(nd)
            self.idx += 1
        else:
            cur_idx = self.update_dict[nd]
        if cur_idx >= len(delta):
            for i in range(cur_idx - len(delta)):
                delta.append(np.zeros(opt_vec.shape))
            delta.append(opt_vec)
        else:
            delta[cur_idx] += opt_vec
        return delta

    def _update_intra_vec(self, batch):
        '''Compute the batch-averaged gradients of the node embeddings, the
        mapping matrix W and the layer embeddings for one batch of intra-layer
        edges and their negative samples.'''
        pos, neg = batch
        batch_size = len(pos['h'])

        # order 1
        pos_u = np.dot(
            self.params['node'][pos['h'], :],
            self.params['W']) + self.params['layer'][pos['h_layer'], :]
        pos_v = np.dot(
            self.params['node'][pos['t'], :],
            self.params['W']) + self.params['layer'][pos['t_layer'], :]
        neg_u = np.dot(
            self.params['node'][neg['h'], :],
            self.params['W']) + self.params['layer'][neg['h_layer'], :]
        neg_v = np.dot(
            self.params['node'][neg['t'], :],
            self.params['W']) + self.params['layer'][neg['t_layer'], :]

        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        # delta calculation
        delta_eh = list()
        delta_l = np.zeros(
            (self.num_layers, self.layer_rep_size))  ### problem in here ###

        idx = 0
        for i in range(len(pos['t'])):
            u, v = pos['h'][i], pos['t'][i]
            u_layer, v_layer = pos['h_layer'][i], pos['t_layer'][i]
            delta_eh = self._calc_delta_vec(
                v, delta_eh, (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            delta_eh = self._calc_delta_vec(
                u, delta_eh, (sigmoid_pos_e[i] - 1) * pos_v[i, :])
            delta_l[v_layer] = (sigmoid_pos_e[i] - 1) * pos_u[i, :]
            delta_l[u_layer] = (sigmoid_pos_e[i] - 1) * pos_v[i, :]
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = neg['h'][i][j], neg['t'][i][j]
                u_layer, v_layer = neg['h_layer'][i][j], neg['t_layer'][i][j]
                delta_eh = self._calc_delta_vec(
                    v, delta_eh, sigmoid_neg_e[i, j] * neg_u[i, j, :])
                delta_eh = self._calc_delta_vec(
                    u, delta_eh, sigmoid_neg_e[i, j] * neg_v[i, j, :])
                delta_l[v_layer] = sigmoid_neg_e[i, j] * neg_u[i, j, :]
                delta_l[u_layer] = sigmoid_neg_e[i, j] * neg_v[i, j, :]

        delta_eh = np.array(delta_eh)
        delta_l = np.array(delta_l)

        # delta node, delta W, delta layer
        # print(self.params['node'][self.update_look_back,:].shape, delta_eh.shape)
        return np.dot(delta_eh, self.params['W'].T)/(batch_size*(1+self.negative_ratio))/2 \
                , np.dot(self.params['node'][self.update_look_back,:].T, delta_eh/batch_size) \
                , np.sum(delta_l, axis=0)/(batch_size*(1+self.negative_ratio))*self.num_layers

    def _get_loss(self, batch):
        pos, neg = batch
        batch_size = len(pos['h'])

        # order 1
        pos_u = np.dot(
            self.params['node'][pos['h'], :],
            self.params['W']) + self.params['layer'][pos['h_layer'], :]
        pos_v = np.dot(
            self.params['node'][pos['t'], :],
            self.params['W']) + self.params['layer'][pos['t_layer'], :]
        neg_u = np.dot(
            self.params['node'][neg['h'], :],
            self.params['W']) + self.params['layer'][neg['h_layer'], :]
        neg_v = np.dot(
            self.params['node'][neg['t'], :],
            self.params['W']) + self.params['layer'][neg['t_layer'], :]

        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v,
                       axis=2)  # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([
            self._fast_sigmoid(val) for val in pos_e.reshape(-1)
        ]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([
            self._fast_sigmoid(val) for val in neg_e.reshape(-1)
        ]).reshape(neg_e.shape)

        return -np.mean(
            np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_cur_batch_loss(self, t, batch):
        loss = self._get_loss(batch)

        self.logger.info('Finish processing batch {} and loss:{}'.format(
            t, loss))
        return loss

    def update_node_vec(self, h_delta, delta, embeddings, len_delta):
        h_delta[self.update_look_back[:len_delta], :] += delta**2
        # print 'original embedding:',embeddings[self.update_look_back[:len_delta]]
        embeddings[self.update_look_back[:len_delta],:] -= \
                                self.lr/np.sqrt(h_delta[self.update_look_back[:len_delta],:])*delta
        # print 'delta:',delta
        # print 'h_delta:',h_delta[self.update_look_back[:len_delta]]
        # print 'embeddings:',embeddings[self.update_look_back[:len_delta]]
        # print 'lmd_rda:',elem_lbd
        return h_delta, embeddings

    def update_vec(self, h_delta, delta, embeddings):
        h_delta += delta**2
        # print 'original embedding:',embeddings[self.update_look_back[:len_delta]]
        embeddings -= self.lr / np.sqrt(h_delta) * delta
        # print 'delta:',delta
        # print 'h_delta:',h_delta[self.update_look_back[:len_delta]]
        # print 'embeddings:',embeddings[self.update_look_back[:len_delta]]
        # print 'lmd_rda:',elem_lbd
        return h_delta, embeddings

    def update_node_vec_by_adam(self, m, v, delta, embeddings, t):
        self.beta1 = .9
        self.beta2 = .999
        m[self.update_look_back,:] = \
            self.beta1*m[self.update_look_back,:]+(1-self.beta1)*delta
        v[self.update_look_back,:] = \
            self.beta2*v[self.update_look_back,:]+(1-self.beta2)*(delta**2)
        m_ = m[self.update_look_back, :] / (1 - self.beta1**t)
        v_ = v[self.update_look_back, :] / (1 - self.beta2**t)

        embeddings[self.update_look_back, :] -= self.lr * m_ / (np.sqrt(v_) +
                                                                self.epsilon)

        return m, v, embeddings

    def update_vec_by_adam(self, m, v, delta, embeddings, t):
        self.beta1 = .9
        self.beta2 = .999
        m = self.beta1 * m + (1 - self.beta1) * delta
        v = self.beta2 * v + (1 - self.beta2) * (delta**2)
        m_ = m / (1 - self.beta1**t)
        v_ = v / (1 - self.beta2**t)

        embeddings -= self.lr * m_ / (np.sqrt(v_) + self.epsilon)

        return m, v, embeddings

    def train_one_epoch(self):
        DISPLAY_EPOCH = 1000

        opt_type = 'adam'
        loss = 0
        batches = self.batch_iter()
        for batch in batches:
            self.idx = 0
            self.update_look_back = list()
            self.update_dict = defaultdict(int)

            # delta node, delta W, delta layer
            delta_node, delta_W, delta_layer = self._update_intra_vec(batch)
            if opt_type == 'adagrad':
                # update_node_vec / update_vec return (h_delta, params); unpack both
                self.h_delta['node'], self.params['node'] = \
                        self.update_node_vec(self.h_delta['node'], delta_node, self.params['node'], len(delta_node))
                self.h_delta['W'], self.params['W'] = self.update_vec(
                    self.h_delta['W'], delta_W, self.params['W'])
                self.h_delta['layer'], self.params['layer'] = self.update_vec(
                    self.h_delta['layer'], delta_layer, self.params['layer'])
            if opt_type == 'adam':
                self.m['node'], self.v['node'], self.params['node'] = \
                        self.update_node_vec_by_adam(self.m['node'], self.v['node'], delta_node
                            , self.params['node'], self.t)
                self.m['W'], self.v['W'], self.params['W'] = \
                        self.update_vec_by_adam(self.m['W'], self.v['W'], delta_W
                            , self.params['W'], self.t)
                self.m['layer'], self.v['layer'], self.params['layer'] = \
                        self.update_vec_by_adam(self.m['layer'], self.v['layer'], delta_layer
                            , self.params['layer'], self.t)

            if (self.t - 1) % DISPLAY_EPOCH == 0:
                loss += self.get_cur_batch_loss(self.t, batch)

            # print self.t, DISPLAY_EPOCH
            self.t += 1
        self.cur_epoch += 1

    def _get_nd_layer(self, idx):
        nd = self.look_back[idx]
        p = re.compile(r'(^\d+)-.*?')
        m = p.match(nd)
        if m:
            return int(m.group(1))
        return -1

    def layer_adjust(self, h_idx, t_idx):
        return self._get_nd_layer(h_idx) \
                if self._get_nd_layer(h_idx)>self._get_nd_layer(t_idx) \
                else self._get_nd_layer(t_idx)

    def get_random_node_pairs(self, i, shuffle_indices, edges, edge_set,
                              numNodes):
        # balance the appearance of edges according to edge_prob
        if not random.random() < self.edge_prob[shuffle_indices[i]]:
            shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]
        pos = dict()
        pos['h'] = edges[shuffle_indices[i]][0]
        pos['t'] = edges[shuffle_indices[i]][1]
        pos['h_layer'] = self.layer_adjust(pos['h'], pos['t'])
        pos['t_layer'] = self.layer_adjust(pos['h'], pos['t'])
        head = pos['h'] * numNodes
        neg = defaultdict(list)
        # print(self.negative_ratio)
        for j in range(self.negative_ratio):
            rn = self.sampling_table[random.randint(0, self.table_size - 1)]
            # print(self.sampling_table)
            # print('rn:',rn)
            while head + rn in edge_set or pos['h'] == rn or rn in neg['t']:
                rn = self.sampling_table[random.randint(
                    0, self.table_size - 1)]
                # print('rn in iteration:',rn)
            # print(rn)
            neg['h'].append(pos['h'])
            neg['t'].append(rn)
            neg['h_layer'].append(self.layer_adjust(pos['h'], rn))
            neg['t_layer'].append(self.layer_adjust(pos['h'], rn))
        return pos, neg

    def batch_iter(self):

        edges = []
        for k in range(self.num_layers):
            g = self.layer_graphs[k]
            edges += [(self.look_up[x[0]], self.look_up[x[1]])
                      for x in g.G.edges()]

        data_size = len(edges)
        edge_set = set([x[0] * self.node_size + x[1] for x in edges])
        shuffle_indices = np.random.permutation(np.arange(data_size))

        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while start_index < data_size:
            ret = {}
            pos = defaultdict(list)
            neg = defaultdict(list)
            for i in range(start_index, end_index):
                cur_pos, cur_neg = self.get_random_node_pairs(
                    i, shuffle_indices, edges, edge_set, self.node_size)
                pos['h'].append(cur_pos['h'])
                pos['h_layer'].append(cur_pos['h_layer'])
                pos['t'].append(cur_pos['t'])
                pos['t_layer'].append(cur_pos['t_layer'])
                neg['h'].append(cur_neg['h'])
                neg['h_layer'].append(cur_neg['h_layer'])
                neg['t'].append(cur_neg['t'])
                neg['t_layer'].append(cur_neg['t_layer'])
            ret = (pos, neg)

            start_index = end_index
            end_index = min(start_index + self.batch_size, data_size)

            yield ret

    def _gen_sampling_table(self):
        table_size = self.table_size
        power = 0.75
        look_up = self.look_up
        numNodes = self.node_size

        print("Pre-procesing for non-uniform negative sampling!")
        node_degree = np.zeros(numNodes)  # out degree
        edges = []
        for k in range(self.num_layers):
            g = self.layer_graphs[k]
            edges += [(look_up[x[0]], look_up[x[1]], g.G[x[0]][x[1]]['weight'])
                      for x in g.G.edges()]
        # print(g.G.edges())
        # print('look_up',look_up)
        for edge in edges:
            node_degree[edge[0]] += edge[2]

        norm = sum([math.pow(node_degree[i], power) for i in range(numNodes)])

        self.sampling_table = np.zeros(int(table_size), dtype=np.uint32)

        # print(numNodes)
        # print(node_degree)
        p = 0
        i = 0
        for j in range(numNodes):
            p += float(math.pow(node_degree[j], power)) / norm
            while i < table_size and float(i) / table_size < p:
                self.sampling_table[i] = j
                i += 1
        # print(self.sampling_table)

        data_size = len(edges)
        self.edge_alias = np.zeros(data_size, dtype=np.int32)
        self.edge_prob = np.zeros(data_size, dtype=np.float32)
        large_block = np.zeros(data_size, dtype=np.int32)
        small_block = np.zeros(data_size, dtype=np.int32)

        total_sum = sum([edge[2] for edge in edges])
        norm_prob = [edge[2] * data_size / total_sum for edge in edges]
        num_small_block = 0
        num_large_block = 0
        cur_small_block = 0
        cur_large_block = 0
        for k in range(data_size - 1, -1, -1):
            if norm_prob[k] < 1:
                small_block[num_small_block] = k
                num_small_block += 1
            else:
                large_block[num_large_block] = k
                num_large_block += 1
        while num_small_block and num_large_block:
            num_small_block -= 1
            cur_small_block = small_block[num_small_block]
            num_large_block -= 1
            cur_large_block = large_block[num_large_block]
            self.edge_prob[cur_small_block] = norm_prob[cur_small_block]
            self.edge_alias[cur_small_block] = cur_large_block
            norm_prob[cur_large_block] = norm_prob[
                cur_large_block] + norm_prob[cur_small_block] - 1
            if norm_prob[cur_large_block] < 1:
                small_block[num_small_block] = cur_large_block
                num_small_block += 1
            else:
                large_block[num_large_block] = cur_large_block
                num_large_block += 1

        while num_large_block:
            num_large_block -= 1
            self.edge_prob[large_block[num_large_block]] = 1
        while num_small_block:
            num_small_block -= 1
            self.edge_prob[small_block[num_small_block]] = 1

    def get_one_embeddings(self, params):
        vectors = dict()
        look_back = self.look_back
        for i, param in enumerate(params):
            vectors[look_back[i]] = param
        return vectors

    def get_vectors(self):
        ret = dict()
        ret['node'] = self.get_one_embeddings(self.params['node'])
        ret['W'] = self.params['W']
        ret['layer'] = self.params['layer']

        return ret
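For clarity, a self-contained sketch of the alias-table construction and draw used by _gen_sampling_table and get_random_node_pairs above, with made-up edge weights: drawing a uniform index and following edge_prob / edge_alias samples edges proportionally to their weights in O(1).

import random

import numpy as np


def build_alias_table(weights):
    """Return (prob, alias) arrays for O(1) weighted sampling."""
    n = len(weights)
    norm_prob = [w * n / float(sum(weights)) for w in weights]
    prob = np.zeros(n, dtype=np.float32)
    alias = np.zeros(n, dtype=np.int32)
    small = [i for i in range(n) if norm_prob[i] < 1]
    large = [i for i in range(n) if norm_prob[i] >= 1]
    while small and large:
        s, l = small.pop(), large.pop()
        prob[s], alias[s] = norm_prob[s], l
        norm_prob[l] += norm_prob[s] - 1
        (small if norm_prob[l] < 1 else large).append(l)
    for rest in (large, small):
        while rest:
            prob[rest.pop()] = 1.
    return prob, alias


def draw(prob, alias):
    """Pick a column uniformly, then keep it or jump to its alias."""
    i = random.randint(0, len(prob) - 1)
    return i if random.random() < prob[i] else alias[i]


toy_weights = [5., 1., 1., 1.]                     # hypothetical edge weights
prob, alias = build_alias_table(toy_weights)
counts = np.bincount([draw(prob, alias) for _ in range(10000)])
print(counts / 10000.)                             # roughly [.625, .125, .125, .125]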
Example no. 15
0
class _MNA(object):
    def __init__(self, graph, attr_file, anchorfile, use_net, valid_prop,
                 neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return

        self.use_net = use_net
        self.graph = graph
        self.lookup = dict()
        self.lookup['f'] = self.graph['f'].look_up_dict
        self.lookup['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list
        self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop)

        self.attributes = dict()
        if attr_file:
            self.attributes['f'] = self._set_node_attributes(attr_file[0])
            self.attributes['g'] = self._set_node_attributes(attr_file[1])

        self.neg_ratio = neg_ratio
        self.batch_size = 1024

        self.clf = svm.SVC(probability=True)

    def _set_node_attributes(self, attr_file):
        node_attributes = defaultdict(list)
        if not attr_file:
            return None
        with open(attr_file, 'r') as fin:
            for ln in fin:
                elems = ln.strip().split(',')
                node_attributes[elems[0]] = list(map(float, elems[1:]))
        return node_attributes

    def _get_pair_features(self, src_nds, target_nds):
        pair_features = list()
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The numbers of source and target nodes passed to _get_pair_features differ.'
            )
            return
        for i in range(len(src_nds)):
            src_nd_idx, target_nd_idx = src_nds[i], target_nds[i]
            src_nd = self.look_back['f'][src_nd_idx]
            target_nd = self.look_back['g'][target_nd_idx]

            src_neighbor_anchors = set()
            for src_nd_to in self.graph['f'].G[src_nd]:
                if src_nd_to in self.L['f2g']['train']:
                    src_neighbor_anchors.add(src_nd_to)

            target_neighbor_anchors = set()
            for target_nd_to in self.graph['g'].G[target_nd]:
                if target_nd_to in self.L['g2f']['train']:
                    target_neighbor_anchors.add(target_nd_to)

            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for k in range(len(self.L['f2g']['train'][sna])):
                    target_anchor_nd = self.L['f2g']['train'][sna][k]
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        AA_measure += 1./np.log((len(self.graph['f'].G[sna])\
                                                +len(self.graph['g'].G[self.L['f2g']['train'][sna][k]]))/2.)
            jaccard = cnt_common_neighbors/(len(self.graph['f'].G[src_nd])\
                                            +len(self.graph['g'].G[target_nd])\
                                            -cnt_common_neighbors+1e-6)

            # print(self.attributes['f'][src_nd], self.attributes['g'][target_nd])
            feat_net = []
            feat_attr = []
            if self.use_net:
                feat_net = [cnt_common_neighbors, jaccard, AA_measure]
            if len(self.attributes) > 0:
                feat_len = len(self.attributes['f'][src_nd])
                feat_attr = [1-self.attributes['f'][src_nd][k]\
                                +self.attributes['g'][target_nd][k] for k in range(feat_len)]

            # print(len(feat_net), len(feat_attr))
            yield feat_net + feat_attr

    def train(self):

        batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio,
                                 self.lookup, 'f', 'g')

        X = list()
        Y = list()
        for batch in batches_f2g:
            pos, neg = batch
            if len(pos['f']) != len(pos['g']) or len(neg['f']) != len(
                    neg['g']):
                self.logger.info(
                    'The input label file is malformed.')
                continue
            pos_features = list(self._get_pair_features(pos['f'], pos['g']))
            # print('feat_len (pos):',len(pos_features[0]))
            X.extend(pos_features)
            Y.extend([1 for m in range(len(pos_features))])

            for k in range(self.neg_ratio):
                neg_features = list(
                    self._get_pair_features(neg['f'][k], neg['g'][k]))
                X.extend(neg_features)
                # print('feat_len (neg):',len(neg_features[0]))
                Y.extend([-1 for m in range(len(neg_features))])

            self.logger.info('Training Model...')
            print(len(X), len(X[0]), len(Y))
            self.clf.fit(X, Y)
            print(self.clf)
            self.logger.info('Training score: %f' % self.clf.score(X, Y))
            self.logger.info('Complete Training process...')
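A toy, self-contained sketch of the structural pair features computed by _get_pair_features above (common mapped-anchor neighbours, Jaccard, and an Adamic-Adar style score). The two adjacency dicts and the anchor mapping are invented for illustration.

import numpy as np

src_adj = {'a': {'x1', 'x2'}, 'x1': {'a', 'x2'}, 'x2': {'a', 'x1'}}
tgt_adj = {'b': {'y1', 'y2'}, 'y1': {'b', 'y2'}, 'y2': {'b', 'y1'}}
anchors = {'x1': 'y1', 'x2': 'y2'}    # already matched node pairs


def pair_features(src_nd, tgt_nd):
    """Structural features for the candidate pair (src_nd, tgt_nd)."""
    common, aa = 0., 0.
    for sna in src_adj[src_nd] & set(anchors):
        counterpart = anchors[sna]
        if counterpart in tgt_adj[tgt_nd]:
            common += 1.
            avg_deg = (len(src_adj[sna]) + len(tgt_adj[counterpart])) / 2.
            aa += 1. / np.log(avg_deg)
    jaccard = common / (len(src_adj[src_nd]) + len(tgt_adj[tgt_nd]) - common + 1e-6)
    return [common, jaccard, aa]


print(pair_features('a', 'b'))   # [2.0, ~1.0, ~2.89]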
Example no. 16
0
# coding: utf-8
"""
------------------------------------------------------------
   File Name: TestLogHandler.py
   Description: Log operation test
   Author: JHao
   date: 2017/03/06
------------------------------------------------------------
   Change Activity:
                   2017/03/06: Log handler test
                   2017/09/21: Screen output/file output optional (default screen and file output)
------------------------------------------------------------
"""
__author__ = 'JHao'
from utils.LogHandler import LogHandler

log = LogHandler("log_test")
log.info("test_log_info")
Example no. 17
0
class DCNH_DP(object):
    def __init__(self, learning_rate, batch_size, neg_ratio, n_input, n_out,
                 n_hidden, n_layer, device, files, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.device = device

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.neg_ratio = neg_ratio
        self.valid_prop = .9
        self.valid_sample_size = 9

        self.gamma = 1
        self.eta = 0

        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden  # number of neurons in hidden layer
        self.n_input = n_input  # size of node embeddings
        self.n_out = n_out  # hashing code
        self.n_layer = n_layer  # number of layer

        # Set Train Data
        if not isinstance(files, list) or len(files) < 3:
            self.logger.info(
                'The algorithm needs files like [First Graph File, Second Graph File, Label File]'
            )
            return

        # tf Graph input
        self.lookup_f = dict()
        self.lookup_g = dict()
        self.look_back_f = list()
        self.look_back_g = list()
        self._read_train_dat(files[0], files[1],
                             files[2])  # douban, weibo, label files
        self.valid_sample_size = min(
            min(self.valid_sample_size,
                len(self.look_back_f) - 1),
            len(self.look_back_g) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False,
                                                           seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model",
                                   reuse=None,
                                   initializer=initializer):
                self.mlp_weights()
                self.build_graph()
                self.build_valid_graph()
            self.sess.run(tf.global_variables_initializer())

    def _read_embeddings(self, embed_file, lookup, look_back):
        embedding = list()
        with open(embed_file, 'r') as emb_handler:
            idx = 0
            for ln in emb_handler:
                ln = ln.strip()
                if ln:
                    elems = ln.split()
                    if len(elems) == 2:
                        continue
                    embedding.append(list(map(float, elems[1:])))
                    lookup[elems[0]] = idx
                    look_back.append(elems[0])
                    idx += 1
        return np.array(embedding), lookup, look_back

    def _read_train_dat(self, embed1_file, embed2_file, label_file):
        self.L = load_train_valid_labels(label_file, self.valid_prop)
        self.F, self.lookup_f, self.look_back_f = self._read_embeddings(
            embed1_file, self.lookup_f, self.look_back_f)
        self.G, self.lookup_g, self.look_back_g = self._read_embeddings(
            embed2_file, self.lookup_g, self.look_back_g)

    def mlp_weights(self):
        # Store layers weight & bias
        self.weights = dict()
        self.biases = dict()
        self.weights['h0_f'] = tf.Variable(
            tf.random_normal([self.n_input, self.n_hidden]))
        self.weights['h0_g'] = tf.Variable(
            tf.random_normal([self.n_input, self.n_hidden]))
        self.biases['b0_f'] = tf.Variable(tf.zeros([self.n_hidden]))
        self.biases['b0_g'] = tf.Variable(tf.zeros([self.n_hidden]))
        for i in range(1, self.n_layer):
            self.weights['h{}_f'.format(i)] = tf.Variable(
                tf.random_normal([self.n_hidden, self.n_hidden]))
            self.weights['h{}_g'.format(i)] = tf.Variable(
                tf.random_normal([self.n_hidden, self.n_hidden]))
            self.biases['b{}_f'.format(i)] = tf.Variable(
                tf.zeros([self.n_hidden]))
            self.biases['b{}_g'.format(i)] = tf.Variable(
                tf.zeros([self.n_hidden]))
        self.weights['out_f'] = tf.Variable(
            tf.random_normal([self.n_hidden, self.n_out]))
        self.weights['out_g'] = tf.Variable(
            tf.random_normal([self.n_hidden, self.n_out]))
        self.biases['b_out_f'] = tf.Variable(tf.zeros([self.n_out]))
        self.biases['b_out_g'] = tf.Variable(tf.zeros([self.n_out]))

    def build_code_graph(self, inputs, tag):

        # Input layer
        layer = tf.nn.sigmoid(
            tf.add(
                tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                          self.weights['h0_' + tag]),
                self.biases['b0_' + tag]))
        for i in range(1, self.n_layer):
            layer = tf.nn.sigmoid(
                tf.add(tf.matmul(layer, self.weights['h{}_{}'.format(i, tag)]),
                       self.biases['b{}_{}'.format(i, tag)]))
        # Output fully connected layer with a neuron
        code = tf.nn.tanh(
            tf.matmul(layer, self.weights['out_' + tag]) +
            self.biases['b_out_' + tag])

        return code

    def build_train_graph(self, src_tag, obj_tag):

        PF = self.build_code_graph(self.pos_src_inputs,
                                   src_tag)  # batch_size*n_out
        PG = self.build_code_graph(self.pos_obj_inputs,
                                   obj_tag)  # batch_size*n_out
        NF = tf.reshape(
            self.build_code_graph(self.neg_src_inputs, src_tag),
            [-1, self.neg_ratio, self.n_out])  # batch_size*neg_ratio*n_out
        NG = tf.reshape(
            self.build_code_graph(self.neg_obj_inputs, obj_tag),
            [-1, self.neg_ratio, self.n_out])  # batch_size*neg_ratio*n_out
        B = tf.sign(PF + PG)  # batch_size*n_out
        # self.ph['B'] = tf.sign(self.ph['F']+self.ph['G']) # batch_size*n_out

        # train loss
        term1_first = tf.log(
            tf.nn.sigmoid(tf.reduce_sum(.5 * tf.multiply(PF, PG), axis=1)))
        term1_second = tf.reduce_sum(tf.log(
            1 -
            tf.nn.sigmoid(tf.reduce_sum(.5 * tf.multiply(NF, NG), axis=2))),
                                     axis=1)
        term1 = -tf.reduce_sum(term1_first + term1_second)
        term2 = tf.reduce_sum(tf.pow(
            (B - PF), 2)) + tf.reduce_sum(tf.pow((B - PG), 2))
        term3 = tf.reduce_sum(tf.pow(PF, 2)) + tf.reduce_sum(tf.pow(PG, 2))
        # term1 = -tf.reduce_sum(tf.multiply(self.ph['S'], theta)-tf.log(1+tf.exp(theta)))
        # term2 = tf.reduce_sum(tf.norm(self.ph['B']-self.ph['F'],axis=1))+tf.reduce_sum(tf.norm(self.ph['B']-self.ph['G'],axis=1))
        # term3 = tf.reduce_sum(tf.norm(self.ph['F'],axis=1))+tf.reduce_sum(tf.norm(self.ph['G'],axis=1))

        return (term1 + self.gamma * term2 +
                self.eta * term3) / self.cur_batch_size

    def build_graph(self):
        self.cur_batch_size = tf.placeholder('float32', name='batch_size')

        self.pos_src_inputs = tf.placeholder('float32', [None, self.n_input])
        self.pos_obj_inputs = tf.placeholder('float32', [None, self.n_input])
        self.neg_src_inputs = tf.placeholder(
            'float32', [None, self.neg_ratio, self.n_input])
        self.neg_obj_inputs = tf.placeholder(
            'float32', [None, self.neg_ratio, self.n_input])

        self.loss_f2g = self.build_train_graph('f', 'g')
        self.loss_g2f = self.build_train_graph('g', 'f')
        # self.loss = (term1+self.eta*term3)/self.cur_batch_size
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op_f2g = optimizer.minimize(self.loss_f2g)
        self.train_op_g2f = optimizer.minimize(self.loss_g2f)

    def build_valid_graph(self):

        # validation
        self.valid_f_inputs = tf.placeholder(
            'float32', [None, self.valid_sample_size, self.n_input])
        self.valid_g_inputs = tf.placeholder(
            'float32', [None, self.valid_sample_size, self.n_input])

        valid_f = tf.reshape(self.build_code_graph(self.valid_f_inputs, 'f'),
                             [-1, self.valid_sample_size, self.n_out
                              ])  # batch_size*neg_ratio*n_out
        valid_g = tf.reshape(self.build_code_graph(self.valid_g_inputs, 'g'),
                             [-1, self.valid_sample_size, self.n_out
                              ])  # batch_size*neg_ratio*n_out
        # self.dot_dist = tf.reduce_sum(tf.multiply(valid_f, valid_g),axis=2)
        self.hamming_dist = -tf.reduce_sum(tf.clip_by_value(
            tf.sign(tf.multiply(valid_f, valid_g)), -1., 0.),
                                           axis=2)

    def train_one_epoch(self):
        sum_loss = 0.0

        # train process
        batches_f2g = list(batch_iter(self.L, self.batch_size, self.neg_ratio\
                , self.lookup_f, self.lookup_g, 'f', 'g'))
        batches_g2f = list(batch_iter(self.L, self.batch_size, self.neg_ratio\
                , self.lookup_g, self.lookup_f, 'g', 'f'))
        n_batches = min(len(batches_f2g), len(batches_g2f))
        batch_id = 0
        for i in range(n_batches):
            # training the process from network f to network g
            pos_src_f2g, pos_obj_f2g, neg_src_f2g, neg_obj_f2g = batches_f2g[i]
            if len(pos_src_f2g) != len(pos_obj_f2g) or len(
                    neg_src_f2g) != len(neg_obj_f2g):
                self.logger.info(
                    'The input label file is not in the expected format.')
                continue
            batch_size_f2g = len(pos_src_f2g)
            feed_dict = {
                self.pos_src_inputs: self.F[pos_src_f2g, :],
                self.pos_obj_inputs: self.G[pos_obj_f2g, :],
                self.neg_src_inputs: self.F[neg_src_f2g, :],
                self.neg_obj_inputs: self.G[neg_obj_f2g, :],
                self.cur_batch_size: batch_size_f2g
            }
            _, cur_loss_f2g = self.sess.run([self.train_op_f2g, self.loss_f2g],
                                            feed_dict)

            sum_loss += cur_loss_f2g

            # training the process from network g to network f
            pos_src_g2f, pos_obj_g2f, neg_src_g2f, neg_obj_g2f = batches_g2f[i]
            if len(pos_src_g2f) != len(pos_obj_g2f) or len(
                    neg_src_g2f) != len(neg_obj_g2f):
                self.logger.info(
                    'The input label file is not in the expected format.')
                continue
            batch_size_g2f = len(pos_src_g2f)
            feed_dict = {
                self.pos_src_inputs: self.G[pos_src_g2f, :],
                self.pos_obj_inputs: self.F[pos_obj_g2f, :],
                self.neg_src_inputs: self.G[neg_src_g2f, :],
                self.neg_obj_inputs: self.F[neg_obj_g2f, :],
                self.cur_batch_size: batch_size_g2f
            }
            _, cur_loss_g2f = self.sess.run([self.train_op_g2f, self.loss_g2f],
                                            feed_dict)

            sum_loss += cur_loss_g2f

            batch_id += 1
            # NOTE: this break means only the first batch is processed per call;
            # remove it to train on all batches in the epoch.
            break

        # valid process
        valid_f, valid_g = valid_iter(self.L, self.valid_sample_size,
                                      self.lookup_f, self.lookup_g, 'f', 'g')
        # print valid_f,valid_g
        if len(valid_f) != len(valid_g):
            self.logger.info(
                'The input label file is not in the expected format.')
            return
        valid_size = len(valid_f)
        feed_dict = {
            self.valid_f_inputs: self.F[valid_f, :],
            self.valid_g_inputs: self.G[valid_g, :],
        }
        # valid_dist = self.sess.run(self.dot_dist,feed_dict)
        valid_dist = self.sess.run(self.hamming_dist, feed_dict)
        mrr = .0
        for i in range(valid_size):
            fst_dist = valid_dist[i][0]
            pos = 1
            for k in range(1, len(valid_dist[i])):
                if fst_dist >= valid_dist[i][k]:
                    pos += 1
            # print pos
            # self.logger.info('dist:{},pos:{}'.format(fst_dist,pos))
            # print valid_dist[i]
            mrr += 1. / pos
        self.logger.info('Epoch={}, sum of loss={!s}, mrr={}'.format(
            self.cur_epoch, sum_loss / batch_id / 2, mrr / valid_size))
        # print 'mrr:',mrr/valid_size
        # self.logger.info('Epoch={}, sum of loss={!s}, valid_loss={}'
        #                     .format(self.cur_epoch, sum_loss/batch_id, valid_loss))
        self.cur_epoch += 1

    def _write_in_file(self, filename, vec, tag):
        with open(filename, 'a') as res_handler:
            if len(vec.shape) > 1:
                column_size = vec.shape[1]
            else:
                column_size = 1
            reshape_vec = vec.reshape(-1)
            vec_size = len(reshape_vec)
            res_handler.write(tag + '\n')
            for i in range(0, vec_size, column_size):
                res_handler.write('{}\n'.format(' '.join(
                    [str(reshape_vec[i + k]) for k in range(column_size)])))

    def save_models(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
        for k, v in self.weights.items():
            self._write_in_file(filename, v.eval(self.sess), k)
        for k, v in self.biases.items():
            self._write_in_file(filename, v.eval(self.sess), k)
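
The validation loop above ranks the true counterpart (index 0 of each row produced by valid_iter) against the sampled candidates and accumulates the reciprocal rank. A minimal NumPy sketch of that MRR computation, with illustrative names:

import numpy as np

def mean_reciprocal_rank(valid_dist):
    """valid_dist: (n_pairs, n_candidates) distances; column 0 holds the true
    counterpart, the remaining columns the sampled candidates (mirrors the
    ranking loop in train_one_epoch above)."""
    valid_dist = np.asarray(valid_dist, dtype=float)
    # rank = 1 + number of candidates at least as close as the true counterpart
    ranks = 1 + np.sum(valid_dist[:, :1] >= valid_dist[:, 1:], axis=1)
    return float(np.mean(1.0 / ranks))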
Esempio n. 18
0
def main(args):
    t1 = time.time()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    # args.use_net=False
    logger = LogHandler('RUN.' +
                        time.strftime('%Y-%m-%d', time.localtime(time.time())))
    logger.info(args)

    SAVING_STEP = args.saving_step
    MAX_EPOCHS = args.epochs
    if args.method == 'pale':
        model = PALE(learning_rate=args.lr,
                     batch_size=args.batch_size,
                     n_input=args.input_size,
                     n_hidden=args.hidden_size,
                     n_layer=args.layers,
                     files=args.embeddings + args.identity_linkage,
                     type_model=args.type_model,
                     is_valid=args.is_valid,
                     log_file=args.log_file,
                     device=args.device)
        losses = np.zeros(MAX_EPOCHS)
        val_scrs = np.zeros(MAX_EPOCHS)
        best_scr = .0
        best_epoch = 0
        thres = 100
        for i in range(1, MAX_EPOCHS + 1):
            losses[i - 1], val_scrs[i - 1] = model.train_one_epoch()
            if i > 0 and i % SAVING_STEP == 0:
                loss_mean = np.mean(losses[i - SAVING_STEP:i])
                scr_mean = np.mean(val_scrs[i - SAVING_STEP:i])
                logger.info(
                    'loss in last {} epochs: {}, validation in last {} epochs: {}'
                    .format(SAVING_STEP, loss_mean, SAVING_STEP, scr_mean))
                if scr_mean > best_scr:
                    best_scr = scr_mean
                    best_epoch = i
                    model.save_models(args.output)
                if args.early_stop and i >= thres * SAVING_STEP:
                    cnt = 0
                    for k in range(thres - 1, -1, -1):
                        cur_val = np.mean(
                            val_scrs[i - (k + 1) * SAVING_STEP:i -
                                     k * SAVING_STEP])
                        if cur_val < best_scr:
                            cnt += 1
                    if cnt == thres and (i -
                                         best_epoch) >= thres * SAVING_STEP:
                        logger.info('*********early stop*********')
                        logger.info(
                            'The best epoch: {}\nThe validation score: {}'.
                            format(best_epoch, best_scr))
                        break
    if args.method == 'mna' or args.method == 'fruip':
        graph = defaultdict(Graph)
        print("Loading graph...")
        if len(args.graphs) != 2:
            logger.error('#####The input graphs must be pairwise!#####')
            sys.exit(1)
        if args.graph_format == 'adjlist':
            if args.graphs[0]:
                graph['f'].read_adjlist(filename=args.graphs[0])
            if args.graphs[1]:
                graph['g'].read_adjlist(filename=args.graphs[1])
        if args.graph_format == 'edgelist':
            if args.graphs[0]:
                graph['f'].read_edgelist(filename=args.graphs[0])
            if args.graphs[1]:
                graph['g'].read_edgelist(filename=args.graphs[1])

        if args.method == 'mna':
            model = MNA(graph=graph, attr_file=args.embeddings, anchorfile=args.identity_linkage, valid_prop=1.\
                        , use_net=args.use_net, neg_ratio=args.neg_ratio, log_file=args.log_file)
        if args.method == 'fruip':
            model = FRUIP(graph=graph,
                          embed_files=args.embeddings,
                          linkage_file=args.identity_linkage)
            model.main_proc(args.threshold)
    if args.method == 'final':
        main_proc(graph_files=args.graphs,
                  graph_sizes=args.graph_sizes,
                  linkage_file=args.identity_linkage,
                  alpha=args.alpha,
                  epoch=args.epochs,
                  tol=args.tol,
                  graph_format=args.graph_format,
                  output_file=args.output)
    if args.method == 'crossmna':
        num_graphs = len(args.graphs)
        layer_graphs = [Graph() for i in range(num_graphs)]
        for k in range(num_graphs):
            graph_path = args.graphs[k]
            format_graph_path = '{}.crossmna'.format(graph_path)
            format_crossmna_graph(graph_path, format_graph_path, k)
            if args.graph_format == 'adjlist':
                layer_graphs[k].read_adjlist(filename=format_graph_path)
            if args.graph_format == 'edgelist':
                layer_graphs[k].read_edgelist(filename=format_graph_path)
        model = CROSSMNA(layer_graphs=layer_graphs,
                         anchor_file=args.identity_linkage,
                         lr=args.lr,
                         batch_size=args.batch_size,
                         nd_rep_size=args.nd_rep_size,
                         layer_rep_size=args.layer_rep_size,
                         epoch=args.epochs,
                         negative_ratio=args.neg_ratio,
                         table_size=args.table_size,
                         outfile=args.output,
                         log_file=args.log_file)
    if args.method in ['mna', 'fruip', 'pale']:
        model.save_model(args.output)
    t2 = time.time()
    print('time cost:', t2 - t1)
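
The early-stopping branch in the PALE path above stops once the best checkpoint is at least thres saving steps old and every one of the last thres windows of validation scores averages below the best score. A standalone sketch of the same check (argument names are illustrative):

import numpy as np

def should_early_stop(val_scrs, i, best_scr, best_epoch, saving_step, thres):
    """Mirror of the early-stop condition in main(): True when all of the last
    `thres` windows of size `saving_step` average below `best_scr` and the best
    epoch is at least `thres * saving_step` epochs in the past."""
    if i < thres * saving_step:
        return False
    windows = [np.mean(val_scrs[i - (k + 1) * saving_step:i - k * saving_step])
               for k in range(thres)]
    return all(w < best_scr for w in windows) and (i - best_epoch) >= thres * saving_step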
Esempio n. 19
0
class PALE(object):
    def __init__(self, learning_rate, batch_size, n_input, n_hidden, n_layer,
                 type_model, is_valid, device, files, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.device = device

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.valid = is_valid
        self.valid_prop = .9 if self.valid else 1.
        self.valid_sample_size = 9

        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden if type_model == 'mlp' else n_input  # number of neurons in hidden layer
        self.n_input = n_input  # size of node embeddings
        self.n_layer = n_layer  # number of layer

        # Set Train Data
        if not isinstance(files, list) or len(files) < 3:
            self.logger.info(
                'The algorithm needs files like [First Graph File, Second Graph File, Label File]'
            )
            return

        # tf Graph input
        self.lookup = defaultdict(dict)
        self.look_back = defaultdict(list)
        self._read_train_dat(files[0], files[1],
                             files[2])  # douban, weibo, label files
        self.valid_sample_size = min(
            min(self.valid_sample_size,
                len(self.look_back['f']) - 1),
            len(self.look_back['g']) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False,
                                                           seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model",
                                   reuse=None,
                                   initializer=initializer):
                self._init_weights(type_model)
                self.build_train_graph(type_model)
                self.build_valid_graph(type_model)
            self.sess.run(tf.global_variables_initializer())

    def _read_labels(self, label_file):
        labels = list()
        with open(label_file, 'r') as lb_handler:
            for ln in lb_handler:
                ln = ln.strip()
                if not ln:
                    break
                labels.append(ln.split())
        return labels

    def _read_train_dat(self, embed1_file, embed2_file, label_file):
        self.X, self.lookup['f'], self.look_back['f'] = read_embeddings(
            embed1_file)
        self.Y, self.lookup['g'], self.look_back['g'] = read_embeddings(
            embed2_file)

        self.L = load_train_valid_labels(label_file, self.lookup,
                                         self.valid_prop)

    def _init_weights(self, type_code_graph):
        # Store layers weight & bias
        self.weights = dict()
        self.biases = dict()
        if type_code_graph == 'mlp':
            self.weights['h0'] = tf.Variable(
                tf.random_normal([self.n_input, self.n_hidden]))
            self.biases['b0'] = tf.Variable(tf.zeros([self.n_hidden]))
            for i in range(1, self.n_layer):
                self.weights['h{}'.format(i)] = tf.Variable(
                    tf.random_normal([self.n_hidden, self.n_hidden]))
                self.biases['b{}'.format(i)] = tf.Variable(
                    tf.zeros([self.n_hidden]))
        self.weights['out'] = tf.Variable(
            tf.random_normal([self.n_hidden, self.n_input]))
        self.biases['b_out'] = tf.Variable(tf.zeros([self.n_input]))

    def build_mlp_code_graph(self, inputs):

        # Input layer
        layer = tf.nn.sigmoid(
            tf.add(
                tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                          self.weights['h0']), self.biases['b0']))
        for i in range(1, self.n_layer):
            layer = tf.nn.sigmoid(
                tf.add(tf.matmul(layer, self.weights['h{}'.format(i)]),
                       self.biases['b{}'.format(i)]))
        # Output fully connected layer with a neuron
        code = tf.nn.tanh(
            tf.matmul(layer, self.weights['out']) + self.biases['b_out'])

        return code

    def build_lin_code_graph(self, inputs):

        # Output fully connected layer with a neuron
        code = tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                         self.weights['out']) + self.biases['b_out']

        return code

    def build_train_graph(self, type_code_graph):

        if type_code_graph == 'lin':
            code_graph = self.build_lin_code_graph
        elif type_code_graph == 'mlp':
            code_graph = self.build_mlp_code_graph

        self.cur_batch_size = tf.placeholder('float32', name='batch_size')

        self.pos_inputs = {
            'f': tf.placeholder('float32', [None, self.n_input]),
            'g': tf.placeholder('float32', [None, self.n_input])
        }

        self.PF = code_graph(self.pos_inputs['f'])  # batch_size*n_input

        # train loss
        self.loss = tf.reduce_mean(.5 *
                                   tf.square(self.PF - self.pos_inputs['g']))

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

    def build_valid_graph(self, type_code_graph):

        if type_code_graph == 'lin':
            code_graph = self.build_lin_code_graph
        elif type_code_graph == 'mlp':
            code_graph = self.build_mlp_code_graph

        # validation
        self.valid_inputs = {
            'f':
            tf.placeholder('float32',
                           [None, self.valid_sample_size, self.n_input]),
            'g':
            tf.placeholder('float32',
                           [None, self.valid_sample_size, self.n_input])
        }

        valid = tf.reshape(code_graph(self.valid_inputs['f']),
                           [-1, self.valid_sample_size, self.n_input
                            ])  # batch_size*neg_ratio*n_input
        self.dot_dist = tf.reduce_sum(tf.pow(valid - self.valid_inputs['g'],
                                             2.),
                                      axis=2)

    def train_one_epoch(self):
        sum_loss = 0.0
        mrr = 0.0
        valid_size = 0

        # train process
        batches = batch_iter(self.L, self.batch_size, 0, self.lookup, 'f', 'g')
        batch_id = 0
        for batch in batches:
            pos, neg = batch
            if len(pos['f']) != len(pos['g']) or len(neg['f']) != len(
                    neg['g']):
                self.logger.info(
                    'The input label file is not in the expected format.')
                continue
            batch_size = len(pos['f'])
            feed_dict = {
                self.pos_inputs['f']: self.X[pos['f'], :],
                self.pos_inputs['g']: self.Y[pos['g'], :],
                self.cur_batch_size: batch_size
            }
            _, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict)

            sum_loss += cur_loss
            batch_id += 1

        # valid process
        if self.valid:
            valid = valid_iter(self.L, self.valid_sample_size, self.lookup,
                               'f', 'g')
            if len(valid['f']) != len(valid['g']):
                self.logger.info(
                    'The input label file is not in the expected format.')
                return
            valid_size = len(valid['f'])
            feed_dict = {
                self.valid_inputs['f']: self.X[valid['f'], :],
                self.valid_inputs['g']: self.Y[valid['g'], :]
            }
            valid_dist = self.sess.run(self.dot_dist, feed_dict)

            mrr = .0
            for i in range(valid_size):
                fst_dist = valid_dist[i][0]
                pos = 1
                for k in range(1, len(valid_dist[i])):
                    if fst_dist >= valid_dist[i][k]:
                        pos += 1
                mrr += 1. / pos
            self.logger.info(
                'Epoch={}, sum of loss={!s}, mrr in validation={}'.format(
                    self.cur_epoch, sum_loss / (batch_id + 1e-8),
                    mrr / (valid_size + 1e-8)))
        else:
            self.logger.info('Epoch={}, sum of loss={!s}'.format(
                self.cur_epoch, sum_loss / batch_id))
        self.cur_epoch += 1

        return sum_loss / (batch_id + 1e-8), mrr / (valid_size + 1e-8)

    def _write_in_file(self, filename, vec, tag):
        with open(filename, 'a+') as res_handler:
            if len(vec.shape) > 1:
                column_size = vec.shape[1]
            else:
                column_size = 1
            reshape_vec = vec.reshape(-1)
            vec_size = len(reshape_vec)
            res_handler.write(tag + '\n')
            for i in range(0, vec_size, column_size):
                res_handler.write('{}\n'.format(' '.join(
                    [str(reshape_vec[i + k]) for k in range(column_size)])))

    def save_models(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
        for k, v in self.weights.items():
            self._write_in_file(filename, v.eval(self.sess), k)
        for k, v in self.biases.items():
            self._write_in_file(filename, v.eval(self.sess), k)
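
PALE above learns a mapping phi (linear or MLP) from the source embedding space to the target space by minimising a mean squared error over anchored pairs, and validates by ranking candidates by squared Euclidean distance to phi(x). A minimal NumPy sketch of the linear variant of that objective, with illustrative names:

import numpy as np

def pale_linear_loss(X_f, X_g, W, b):
    """Mean squared error of the linear mapping phi(x) = x W + b over a batch of
    anchored embedding pairs (rows of X_f and X_g), as in build_train_graph."""
    pred = X_f @ W + b
    return float(np.mean(0.5 * np.square(pred - X_g)))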
Esempio n. 20
0
import threading
import requests
from apscheduler.schedulers.blocking import BlockingScheduler
import os
import sys

# To avoid import problems, append the grandparent project directory so this file can run standalone
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(os.path.split(curPath)[0])[0]
sys.path.append(rootPath)

from models.dataBase.Redis import rds
from utils.request import getHtmlTree, ex_request
from utils.LogHandler import LogHandler

log1 = LogHandler('proxyScheduler')
log2 = LogHandler('usefulProxies')


class GetFreeProxy(object):
    """
    参考https://github.com/jhao104/proxy_pool
    扒出来爬虫和检测代码,重写了逻辑
    用BlockingScheduler实现定时任务,爬取代理存入proxies,检验可用存入useful_proxies
    """

    @staticmethod
    def freeProxy01():
        """
        无忧代理 http://www.data5u.com/
        几乎没有能用的
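
The docstring above describes a BlockingScheduler that periodically crawls proxy sites and re-validates the stored proxies; a minimal sketch of that scheduling pattern, with crawl_all_proxies and check_useful_proxies standing in for the project's actual jobs:

from apscheduler.schedulers.blocking import BlockingScheduler

def crawl_all_proxies():
    pass  # placeholder: run each GetFreeProxy.freeProxyXX() and store the results

def check_useful_proxies():
    pass  # placeholder: re-check stored proxies and keep only the usable ones

scheduler = BlockingScheduler()
scheduler.add_job(crawl_all_proxies, 'interval', minutes=30)    # periodic crawl
scheduler.add_job(check_useful_proxies, 'interval', minutes=5)  # periodic validation
scheduler.start()  # blocks the current thread and runs the jobs on schedule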
Esempio n. 21
0
class _MNA(object):
    def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return

        self.L = load_train_valid_labels(anchorfile, valid_prop)
        self.graph = graph
        self.look_up = dict()
        self.look_up['f'] = self.graph['f'].look_up_dict
        self.look_up['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list

        self.neg_ratio = neg_ratio
        self.batch_size = 1024

        self.clf = svm.SVC()

    def __get_pair_features(self, src_nds, target_nds):
        pair_features = list()
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The source and target node lists passed to __get_pair_features differ in length.'
            )
            yield pair_features
            return
        for i in range(len(src_nds)):
            src_nd, target_nd = src_nds[i], target_nds[i]

            if not src_nd in self.graph['f'].G or not target_nd in self.graph[
                    'g'].G:
                continue

            src_neighbor_anchors = set()
            for src_nd_to in self.graph['f'].G[src_nd]:
                if src_nd_to in self.L['f2g']['train']:
                    src_neighbor_anchors.add(src_nd_to)

            target_neighbor_anchors = set()
            for target_nd_to in self.graph['g'].G[target_nd]:
                if target_nd_to in self.L['g2f']['train']:
                    target_neighbor_anchors.add(target_nd_to)

            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for k in range(len(self.L['f2g']['train'][sna])):
                    target_anchor_nd = self.L['f2g']['train'][sna][k]
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        AA_measure += 1. / np.log(
                            (len(self.graph['f'].G[sna]) + len(self.graph[
                                'g'].G[self.L['f2g']['train'][sna][k]])) / 2.)
            jaccard = cnt_common_neighbors/(len(self.graph['f'].G[src_nd])\
                    +len(self.graph['g'].G[target_nd])-cnt_common_neighbors+1e-6)

            yield [cnt_common_neighbors, jaccard, AA_measure]

    def __batch_iter(self, lbs, batch_size, neg_ratio, lookup_src, lookup_obj,
                     src_lb_tag, obj_lb_tag):
        train_lb_src2obj = lbs['{}2{}'.format(src_lb_tag, obj_lb_tag)]['train']
        train_lb_obj2src = lbs['{}2{}'.format(obj_lb_tag, src_lb_tag)]['train']
        train_size = len(train_lb_src2obj)
        start_index = 0
        end_index = min(start_index + batch_size, train_size)

        src_lb_keys = list(train_lb_src2obj.keys())
        obj_lb_keys = list(train_lb_obj2src.keys())
        shuffle_indices = np.random.permutation(np.arange(train_size))
        while start_index < end_index:
            pos_src = list()
            pos_obj = list()
            neg_src = list()
            neg_obj = list()
            for i in range(start_index, end_index):
                idx = shuffle_indices[i]
                src_lb = src_lb_keys[idx]
                obj_lbs = train_lb_src2obj[src_lb]
                for obj_lb in obj_lbs:
                    cur_neg_src = list()
                    cur_neg_obj = list()
                    for k in range(neg_ratio):
                        rand_obj_lb = None
                        while not rand_obj_lb or rand_obj_lb in cur_neg_obj or rand_obj_lb in obj_lbs:
                            rand_obj_lb_idx = random.randint(
                                0,
                                len(obj_lb_keys) - 1)
                            rand_obj_lb = obj_lb_keys[rand_obj_lb_idx]
                        cur_neg_src.append(src_lb)
                        cur_neg_obj.append(rand_obj_lb)
                    pos_src.append(src_lb)
                    pos_obj.append(obj_lb)
                    neg_src.append(cur_neg_src)
                    neg_obj.append(cur_neg_obj)

            start_index = end_index
            end_index = min(start_index + batch_size, train_size)

            yield pos_src, pos_obj, neg_src, neg_obj

    def train(self):

        batches_f2g = list(self.__batch_iter(self.L, self.batch_size, self.neg_ratio\
              , self.look_up['f'], self.look_up['g'], 'f', 'g'))
        n_batches = len(batches_f2g)

        X = list()
        Y = list()
        for i in range(n_batches):
            pos_src_f2g, pos_obj_f2g, neg_src_f2g, neg_obj_f2g = batches_f2g[i]
            if len(pos_src_f2g) != len(pos_obj_f2g) or len(
                    neg_src_f2g) != len(neg_obj_f2g):
                self.logger.info(
                    'The input label file is not in the expected format.')
                continue
            pos_features = list(
                self.__get_pair_features(pos_src_f2g, pos_obj_f2g))
            X.extend(pos_features)
            Y.extend([1 for m in range(len(pos_features))])

            for k in range(self.neg_ratio):
                neg_features = list(
                    self.__get_pair_features(neg_src_f2g[k], neg_obj_f2g[k]))
                X.extend(neg_features)
                Y.extend([-1 for m in range(len(neg_features))])

            self.logger.info('Training model...')
            self.clf.fit(X, Y)
            self.logger.info('Finished training.')
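
__get_pair_features above derives three structural features per candidate pair: the number of common neighbours connected through known anchors, the Jaccard index of the two neighbourhoods, and an Adamic-Adar style score. A small self-contained sketch of the same three features, with hypothetical plain-dict inputs:

import numpy as np

def anchor_pair_features(src_neighbors, tgt_neighbors, anchor_map, deg_f, deg_g):
    """src_neighbors / tgt_neighbors: neighbour sets of the candidate pair in
    graphs f and g; anchor_map: known anchor links, f-node -> list of g-nodes;
    deg_f / deg_g: node -> degree lookups in each graph."""
    common, aa = 0.0, 0.0
    for u in src_neighbors:
        for v in anchor_map.get(u, []):
            if v in tgt_neighbors:
                common += 1.0
                aa += 1.0 / np.log((deg_f[u] + deg_g[v]) / 2.0)
    jaccard = common / (len(src_neighbors) + len(tgt_neighbors) - common + 1e-6)
    return [common, jaccard, aa]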
Esempio n. 22
0
class _ALP_NE(object):

    def __init__(self, graphs, lr=.001, gamma=.1, rep_size=128, batch_size=100, negative_ratio=5, table_size=1e8,
                    anchor_file=None, log_file='log', last_emb_files=dict()):

        if os.path.exists('log/'+log_file+'.log'):
            os.remove('log/'+log_file+'.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6

        self._init_simgoid_table()

        self._init_dicts()
        self.t = 1
        self.rep_size = rep_size
        for graph_type in ['f', 'g']:
            self.g[graph_type] = graphs[graph_type]
            self.look_up[graph_type] = self.g[graph_type].look_up_dict
            self.idx[graph_type] = 0
            self.update_dict[graph_type] = dict()
            self.update_look_back[graph_type] = list()
            self.node_size[graph_type] = self.g[graph_type].node_size
            self.embeddings[graph_type], self.h_delta[graph_type], self.m[graph_type], self.v[graph_type]\
                    = self._init_params(self.node_size[graph_type], rep_size,
                                            last_emb_files, graph_type)
            self._gen_sampling_table(graph_type)

        self.anchors = self._read_anchors(anchor_file, ',')

        self.lr = lr
        self.gamma = gamma
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

    def _init_dicts(self):
        self.g = dict()
        self.look_up = dict()
        self.idx = dict()
        self.update_dict = dict()
        self.update_look_back = dict()
        self.node_size = dict()
        self.embeddings = dict()
        self.h_delta = dict()
        self.m = dict()
        self.v = dict()
        self.node_degree = dict()
        self.sampling_table = dict()
        self.edge_alias = dict()
        self.edge_prob = dict()

    def _init_params(self, node_size, rep_size, last_emb_file, graph_type):
        embeddings = dict()
        embeddings['node'] = np.random.normal(0,1,(node_size,rep_size))
        embeddings['content'] = np.random.normal(0,1,(node_size,rep_size))
        if last_emb_file:
            embeddings['node'] = self._init_emb_matrix(embeddings['node']\
                        , '{}.node_embeddings'.format(last_emb_file[graph_type]), graph_type)
            embeddings['content'] = self._init_emb_matrix(embeddings['content']\
                        , '{}.content_embeddings'.format(last_emb_file[graph_type]), graph_type)
        # adagrad
        h_delta = dict()
        h_delta['node'] = np.zeros((node_size,rep_size))
        h_delta['content'] = np.zeros((node_size,rep_size))
        # adam
        m = dict()
        m['node'] = np.zeros((node_size,rep_size))
        m['content'] = np.zeros((node_size,rep_size))
        v = dict()
        v['node'] = np.zeros((node_size,rep_size))
        v['content'] = np.zeros((node_size,rep_size))

        return embeddings, h_delta, m, v

    def _init_emb_matrix(self, emb, emb_file, graph_type):
        with open(emb_file, 'r') as embed_handler:
            for ln in embed_handler:
                elems = ln.strip().split()
                if len(elems)<=2:
                    continue
                emb[self.look_up[graph_type][elems[0]]] = list(map(float, elems[1:]))
        return emb

    def _read_anchors(self, anchor_file, delimiter):
        anchors = list()
        with open(anchor_file, 'r') as anchor_handler:
            for ln in anchor_handler:
                elems = ln.strip().split(delimiter)
                anchors.append((elems[0], elems[1]))
        return anchors

    def _init_simgoid_table(self):
        for k in range(self.sigmoid_table_size):
            x = 2*self.SIGMOID_BOUND*k/self.sigmoid_table_size-self.SIGMOID_BOUND
            self.sigmoid_table[k] = 1./(1+np.exp(-x))

    def _fast_sigmoid(self, val):
        if val>self.SIGMOID_BOUND:
            return 1-self.epsilon
        elif val<-self.SIGMOID_BOUND:
            return self.epsilon
        k = int((val+self.SIGMOID_BOUND)*self.sigmoid_table_size/self.SIGMOID_BOUND/2)
        return self.sigmoid_table[k]
        # return 1./(1+np.exp(-val))

    def _format_vec(self, vec, graph_type):
        len_gap = self.idx[graph_type]-len(vec)
        if len_gap>0:
            num_col = 0
            if isinstance(vec, list):
                num_col = len(vec[0])
            else:
                num_col = vec.shape[1]
            vec = np.concatenate((vec, np.zeros((len_gap, num_col))))
            # for i in range(len_gap):
            #     vec = np.append(vec, np.zeros(vec[0].shape))
        return np.array(vec)

    def _calc_delta_vec(self, nd, delta, opt_vec, graph_type):
        if nd not in self.update_dict[graph_type]:
            cur_idx = self.idx[graph_type]
            self.update_dict[graph_type][nd] = cur_idx
            self.update_look_back[graph_type].append(nd)
            self.idx[graph_type] += 1
        else:
            cur_idx = self.update_dict[graph_type][nd]
        if cur_idx>=len(delta):
            for i in range(cur_idx-len(delta)):
                delta.append(np.zeros(opt_vec.shape))
            delta.append(opt_vec)
        else:
            delta[cur_idx] += opt_vec
        return delta

    def _update_graph_by_links(self, batch, graph_type):
        '''
        x = self._binarize(self.embeddings[key])
        '''
        pos_h, pos_t, pos_h_v, neg_t = batch[graph_type]
        batch_size = len(pos_h)
        # print pos_h, pos_t, pos_h_v, neg_t

        embeddings = self.embeddings[graph_type]
        # order 2
        pos_u = embeddings['node'][pos_h,:]
        pos_v_c = embeddings['content'][pos_t,:]
        neg_u = embeddings['node'][pos_h_v,:]
        neg_v_c = embeddings['content'][neg_t,:]

        pos_e = np.sum(pos_u*pos_v_c, axis=1) # pos_e.shape = batch_size
        neg_e = np.sum(neg_u*neg_v_c, axis=2) # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([self._fast_sigmoid(val) for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val) for val in neg_e.reshape(-1)]).reshape(neg_e.shape)

        # temporal delta
        delta_eh = list()
        delta_c = list()

        idx = 0
        for i in range(len(pos_t)):
            u,v = pos_h[i],pos_t[i]
            delta_c = self._calc_delta_vec(v, delta_c, (sigmoid_pos_e[i]-1)*pos_u[i,:], graph_type)
            delta_eh = self._calc_delta_vec(u, delta_eh, (sigmoid_pos_e[i]-1)*pos_v_c[i,:], graph_type)
            # print 'delta_eh',delta_eh,ndDict_order
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u,v = pos_h_v[i][j],neg_t[i][j]
                delta_c = self._calc_delta_vec(v, delta_c, sigmoid_neg_e[i,j]*neg_u[i,j,:], graph_type)
                delta_eh = self._calc_delta_vec(u, delta_eh, sigmoid_neg_e[i,j]*neg_v_c[i,j,:], graph_type)
                # print sigmoid_neg_e[i,j]*neg_v_c[i,j,:], type(sigmoid_neg_e[i,j]*neg_v_c[i,j,:])
                # print 'delta_eh',delta_eh,ndDict_order


        # delta x & delta codebook
        delta_eh = self._format_vec(delta_eh, graph_type)
        delta_c = self._format_vec(delta_c, graph_type)

        # print 'in update graph by links '+graph_type
        # print self.idx[graph_type], delta_eh.shape, delta_c.shape

        return delta_c/batch_size, delta_eh/batch_size

    def _cos_sim(self, vec1, vec2):
        return np.dot(vec1,vec2)/np.linalg.norm(vec1)/np.linalg.norm(vec2)

    def _update_graph_by_anchor_reg(self):

        delta_eh = defaultdict(list)

        cnt = 0
        for src_nd, target_nd in self.anchors:
            if not src_nd in self.look_up['f'] or not target_nd in self.look_up['g']:
                continue
            types = ['f', 'g']
            idx = list() # 0 refers to network f, 1 refers to network g
            emb = list()
            idx.append(self.look_up['f'][src_nd])
            idx.append(self.look_up['g'][target_nd])
            emb.append(self.embeddings['f']['node'][idx[0]])
            emb.append(self.embeddings['g']['node'][idx[1]])
            for i in range(len(types)):
                delta_eh[types[i]] = self._calc_delta_vec(idx[i], delta_eh[types[i]]
                                , (self._cos_sim(emb[i], emb[1-i])*emb[i]/np.dot(emb[i], emb[i])
                                    -emb[1-i]/np.linalg.norm(emb[1-i])/np.linalg.norm(emb[i])), types[i])
            cnt += 1

        for graph_type in ['f', 'g']:
            delta_eh[graph_type] = self._format_vec(delta_eh[graph_type], graph_type)/cnt
            # print 'in update graph by anchor reg ' + graph_type
            # print self.idx[graph_type], delta_eh[graph_type].shape

        return delta_eh

    def _mat_add(self, mat1, mat2):
        # print '****mat add****'
        # print mat1, mat2
        len_gap = len(mat1)-len(mat2)
        # print len_gap
        if len_gap>0:
            for i in range(len_gap):
                mat2 = np.vstack((mat2, np.zeros(mat2[0,:].shape)))
                # print mat2
        else:
            for i in range(-len_gap):
                mat1 = np.vstack((mat1, np.zeros(mat1[0,:].shape)))
        #         print mat1
        # print len(mat1), len(mat2)
        return mat1+mat2

    def get_graph_loss(self, batch, graph_type):
        pos_h, pos_t, pos_h_v, neg_t = batch[graph_type]

        embeddings = self.embeddings[graph_type]
        # order 2
        pos_u = embeddings['node'][pos_h,:]
        pos_v_c = embeddings['content'][pos_t,:]
        neg_u = embeddings['node'][pos_h_v,:]
        neg_v_c = embeddings['content'][neg_t,:]

        pos_e = np.sum(pos_u*pos_v_c, axis=1) # pos_e.shape = batch_size
        neg_e = np.sum(neg_u*neg_v_c, axis=2) # neg_e.shape = batch_size*negative_ratio

        sigmoid_pos_e = np.array([self._fast_sigmoid(val) for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val) for val in neg_e.reshape(-1)]).reshape(neg_e.shape)

        return -np.mean(np.log(sigmoid_pos_e)+np.sum(np.log(1-sigmoid_neg_e), axis=1))

    def get_anchor_reg_loss(self):

        cos_sim_list = list()

        for src_nd, target_nd in self.anchors:
            if not src_nd in self.look_up['f'] or not target_nd in self.look_up['g']:
                continue
            src_idx = self.look_up['f'][src_nd]
            target_idx = self.look_up['g'][target_nd]
            
            cos_sim_list.append(self._cos_sim(self.embeddings['f']['node'][src_idx]
                                    , self.embeddings['g']['node'][target_idx]))

        return -np.mean(cos_sim_list)

    def update_vec(self, h_delta, delta, embeddings, len_delta, t, graph_type):
        update_look_back = self.update_look_back[graph_type]
        h_delta[update_look_back[:len_delta],:] += delta**2
        # print 'original embedding:',embeddings[self.update_look_back[cal_type][:len_delta]]
        embeddings[update_look_back[:len_delta],:] -= \
                        self.lr/np.sqrt(h_delta[update_look_back[:len_delta],:])*delta
        # print 'delta:',delta
        # print 'h_delta:',h_delta[self.update_look_back[cal_type][:len_delta]]
        # print 'embeddings:',embeddings[self.update_look_back[cal_type][:len_delta]]
        # print 'lmd_rda:',elem_lbd
        return h_delta, embeddings

    def update_vec_by_adam(self, m, v, delta, embeddings, len_delta, t, graph_type):
        self.beta1 = .9
        self.beta2 = .999
        update_look_back = self.update_look_back[graph_type]
        m[update_look_back[:len_delta],:] = \
            self.beta1*m[update_look_back[:len_delta],:]+(1-self.beta1)*delta
        v[update_look_back[:len_delta],:] = \
            self.beta2*v[update_look_back[:len_delta],:]+(1-self.beta2)*(delta**2)
        m_ = m[update_look_back[:len_delta],:]/(1-self.beta1**t)
        v_ = v[update_look_back[:len_delta],:]/(1-self.beta2**t)

        embeddings[update_look_back[:len_delta],:] -= self.lr*m_/(np.sqrt(v_)+self.epsilon)

        return m,v,embeddings

    def train_one_epoch(self, opt_type):
        DISPLAY_EPOCH=100

        def batch_init():
            for graph_type in ['f', 'g']:
                self.idx[graph_type] = 0
                self.update_look_back[graph_type] = list()
                self.update_dict[graph_type] = dict()

        batches = self.batch_iter()
        last_batch_loss = 1e8
        stop_cnt = 0
        for batch in batches:
            batch_loss = .0
            batch_init()
            delta_eh_anchor_reg = self._update_graph_by_anchor_reg()
            for graph_type in ['f', 'g']:
                # init
                h_delta = self.h_delta[graph_type]
                embeddings = self.embeddings[graph_type]
                m = self.m[graph_type]
                v = self.v[graph_type]
                # end
                delta_c, delta_eh = self._update_graph_by_links(batch, graph_type)
                delta_eh_anchor_reg[graph_type] = self._format_vec(delta_eh_anchor_reg[graph_type], graph_type)
                # print 'in train one epoch'
                # print self.idx[graph_type], delta_eh_anchor_reg[graph_type].shape, delta_eh.shape
                len_delta = len(delta_eh)
                # print 'order2, nd'
                if opt_type=='adagrad':
                    h_delta['node'], embeddings['node'] = \
                                        self.update_vec(h_delta['node']
                                                    , delta_eh+self.gamma*delta_eh_anchor_reg[graph_type]
                                                    , embeddings['node'], len_delta, self.t, graph_type)
                if opt_type=='adam':
                    m['node'], v['node'], embeddings['node'] = \
                                    self.update_vec_by_adam(m['node'], v['node']
                                                    , delta_eh+self.gamma*delta_eh_anchor_reg[graph_type]
                                                    , embeddings['node'], len_delta, self.t, graph_type)
                len_content = len(delta_c)
                # print 'order2, content'
                if opt_type=='adagrad':
                    h_delta['content'], embeddings['content'] = \
                                        self.update_vec(h_delta['content'], delta_c
                                                    , embeddings['content'], len_content, self.t, graph_type)
                if opt_type=='adam':
                    m['content'], v['content'], embeddings['content'] = \
                                    self.update_vec_by_adam(m['content'], v['content'], delta_c
                                                    , embeddings['content'], len_content, self.t, graph_type)
                if (self.t-1)%DISPLAY_EPOCH==0:
                    batch_loss += self.get_graph_loss(batch, graph_type)+self.gamma*self.get_anchor_reg_loss()
            if (self.t-1)%DISPLAY_EPOCH==0:
                self.logger.info('Finish processing batch {} and loss:{}'.format(self.t-1, batch_loss))
                if batch_loss<last_batch_loss:
                    last_batch_loss = batch_loss
                    stop_cnt = 0
                else:
                    stop_cnt += 1
                if stop_cnt>=2:
                    break
            self.t += 1
        self.cur_epoch += 1

    def get_random_node_pairs(self, i, shuffle_indices, edges, edge_set, numNodes, graph_type):
        # balance the appearance of edges according to edge_prob
        edge_prob = self.edge_prob[graph_type]
        edge_alias = self.edge_alias[graph_type]
        sampling_table = self.sampling_table[graph_type]
        if i>=len(shuffle_indices):
            i = np.random.randint(len(shuffle_indices))
        if not random.random() < edge_prob[shuffle_indices[i]]:
            shuffle_indices[i] = edge_alias[shuffle_indices[i]]
        cur_h = edges[shuffle_indices[i]][0]
        head = cur_h*numNodes
        cur_t = edges[shuffle_indices[i]][1]
        cur_h_v = []
        cur_neg_t = []
        for j in range(self.negative_ratio):
            rn = sampling_table[random.randint(0, int(self.table_size)-1)]
            while head+rn in edge_set or cur_h == rn or rn in cur_neg_t:
                rn = sampling_table[random.randint(0, int(self.table_size)-1)]
            cur_h_v.append(cur_h)
            cur_neg_t.append(rn)
        return cur_h, cur_t, cur_h_v, cur_neg_t

    def batch_iter(self):

        data_size = 0
        for graph_type in ['f', 'g']:
            net_size = self.g[graph_type].G.size()
            if net_size > data_size:
                data_size = net_size

        shuffle_indices = dict()
        for graph_type in ['f', 'g']:
            net_size = self.g[graph_type].G.size()
            shuffle_indices[graph_type] = np.random.permutation(np.arange(net_size))
            while net_size<data_size:
                shuffle_indices[graph_type] = np.append(shuffle_indices[graph_type], shuffle_indices[graph_type][:data_size-net_size])
                net_size = len(shuffle_indices[graph_type])

        start_index = 0
        end_index = min(start_index+self.batch_size, data_size)
        while start_index < data_size:
            ret = dict()

            for graph_type in ['f', 'g']:
                numNodes = self.node_size[graph_type]
                look_up = self.look_up[graph_type]
                g = self.g[graph_type]
                edges = [(look_up[x[0]], look_up[x[1]]) for x in g.G.edges()]
                edge_set = set([x[0]*numNodes+x[1] for x in edges])
                pos_h = []
                pos_t = []
                pos_h_v = []
                neg_t = []
                for i in range(start_index, end_index):
                    cur_h, cur_t, cur_h_v, cur_neg_t\
                        = self.get_random_node_pairs(i, shuffle_indices[graph_type], edges, edge_set, numNodes, graph_type)
                    pos_h.append(cur_h)
                    pos_t.append(cur_t)
                    pos_h_v.append(cur_h_v)
                    neg_t.append(cur_neg_t)
                ret[graph_type] = (pos_h, pos_t, pos_h_v, neg_t)

            start_index = end_index
            end_index = min(start_index+self.batch_size, data_size)

            yield ret

    def _gen_sampling_table(self, graph_type):
        table_size = self.table_size
        power = 0.75

        print("Pre-procesing for non-uniform negative sampling in {}!".format(graph_type))
        numNodes = self.node_size[graph_type]
        g = self.g[graph_type]

        node_degree = np.zeros(numNodes) # out degree
        look_up = g.look_up_dict
        for edge in g.G.edges():
            node_degree[look_up[edge[0]]] += g.G[edge[0]][edge[1]]['weight']

        norm = sum([math.pow(node_degree[i], power) for i in range(numNodes)])

        p = 0
        i = 0
        sampling_table = np.zeros(int(table_size), dtype=np.uint32)
        for j in range(numNodes):
            p += float(math.pow(node_degree[j], power)) / norm
            while i < table_size and float(i) / table_size < p:
                sampling_table[i] = j
                i += 1

        data_size = g.G.size()
        edge_alias = np.zeros(data_size, dtype=np.int32)
        edge_prob = np.zeros(data_size, dtype=np.float32)
        large_block = np.zeros(data_size, dtype=np.int32)
        small_block = np.zeros(data_size, dtype=np.int32)

        total_sum = sum([g.G[edge[0]][edge[1]]["weight"] for edge in g.G.edges()])
        norm_prob = [g.G[edge[0]][edge[1]]["weight"]*data_size/total_sum for edge in g.G.edges()]
        num_small_block = 0
        num_large_block = 0
        cur_small_block = 0
        cur_large_block = 0
        for k in range(data_size-1, -1, -1):
            if norm_prob[k] < 1:
                small_block[num_small_block] = k
                num_small_block += 1
            else:
                large_block[num_large_block] = k
                num_large_block += 1
        while num_small_block and num_large_block:
            num_small_block -= 1
            cur_small_block = small_block[num_small_block]
            num_large_block -= 1
            cur_large_block = large_block[num_large_block]
            edge_prob[cur_small_block] = norm_prob[cur_small_block]
            edge_alias[cur_small_block] = cur_large_block
            norm_prob[cur_large_block] = norm_prob[cur_large_block] + norm_prob[cur_small_block] -1
            if norm_prob[cur_large_block] < 1:
                small_block[num_small_block] = cur_large_block
                num_small_block += 1
            else:
                large_block[num_large_block] = cur_large_block
                num_large_block += 1

        while num_large_block:
            num_large_block -= 1
            edge_prob[large_block[num_large_block]] = 1
        while num_small_block:
            num_small_block -= 1
            edge_prob[small_block[num_small_block]] = 1

        self.node_degree[graph_type] = node_degree
        self.sampling_table[graph_type] = sampling_table
        self.edge_alias[graph_type] = edge_alias
        self.edge_prob[graph_type] = edge_prob

    def get_one_embeddings(self, embeddings, graph_type):
        vectors = dict()
        look_back = self.g[graph_type].look_back_list
        for i, embedding in enumerate(embeddings):
            vectors[look_back[i]] = embedding
        return vectors

    def get_vectors(self):
        ret = defaultdict(dict)
        content_embeddings = defaultdict(dict)

        for graph_type in ['f', 'g']:
            node_embeddings=self.get_one_embeddings(self.embeddings[graph_type]['node'], graph_type)
            ret[graph_type]['node_embeddings']=node_embeddings

            content_embeddings=self.get_one_embeddings(self.embeddings[graph_type]['content'], graph_type)
            ret[graph_type]['content_embeddings']=content_embeddings

        return ret
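
The anchor regulariser in _update_graph_by_anchor_reg above descends the negative cosine similarity between the two embeddings of each anchored pair; the update it applies to emb[i] is the gradient of -cos(x, y) with respect to x. A small NumPy sketch of that gradient for reference:

import numpy as np

def neg_cos_grad(x, y):
    """Gradient of -cos(x, y) with respect to x, matching the expression used
    in _update_graph_by_anchor_reg: cos(x, y) * x / ||x||^2 - y / (||x|| * ||y||)."""
    cos = np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
    return cos * x / np.dot(x, x) - y / (np.linalg.norm(x) * np.linalg.norm(y))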
Esempio n. 23
0
class PALE_MLP(object):
    def __init__(self, learning_rate, batch_size, n_input, n_hidden, n_layer,
                 device, files, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.device = device

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.valid_prop = .9
        self.valid_sample_size = 9

        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden  # number of neurons in hidden layer
        self.n_input = n_input  # size of node embeddings
        self.n_layer = n_layer  # number of layer

        # Set Train Data
        if not isinstance(files, list) or len(files) < 3:
            self.logger.info(
                'The algorithm needs files like [First Graph File, Second Graph File, Label File]'
            )
            return

        # tf Graph input
        self.lookup_f = dict()
        self.lookup_g = dict()
        self.look_back_f = list()
        self.look_back_g = list()
        self._read_train_dat(files[0], files[1],
                             files[2])  # douban, weibo, label files
        self.valid_sample_size = min(
            min(self.valid_sample_size,
                len(self.look_back_f) - 1),
            len(self.look_back_g) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False,
                                                           seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model",
                                   reuse=None,
                                   initializer=initializer):
                self.mlp_weights()
                self.build_train_graph()
                self.build_valid_graph()
            self.sess.run(tf.global_variables_initializer())

    def _read_labels(self, label_file):
        labels = list()
        with open(label_file, 'r') as lb_handler:
            for ln in lb_handler:
                ln = ln.strip()
                if not ln:
                    break
                labels.append(ln.split())
        return labels

    def _read_embeddings(self, embed_file, lookup, look_back):
        embedding = list()
        with open(embed_file, 'r') as emb_handler:
            idx = 0
            for ln in emb_handler:
                ln = ln.strip()
                if ln:
                    elems = ln.split()
                    if len(elems) == 2:
                        # skip the header line (node count, embedding dimension)
                        continue
                    embedding.append(list(map(float, elems[1:])))
                    lookup[elems[0]] = idx
                    look_back.append(elems[0])
                    idx += 1
        return np.array(embedding), lookup, look_back

    def _read_train_dat(self, embed1_file, embed2_file, label_file):
        self.L = load_train_valid_labels(label_file, self.valid_prop)
        self.X, self.lookup_f, self.look_back_f = self._read_embeddings(
            embed1_file, self.lookup_f, self.look_back_f)
        self.Y, self.lookup_g, self.look_back_g = self._read_embeddings(
            embed2_file, self.lookup_g, self.look_back_g)

    def mlp_weights(self):
        # Store layers weight & bias
        self.weights = dict()
        self.biases = dict()
        self.weights['h0'] = tf.Variable(
            tf.random_normal([self.n_input, self.n_hidden]))
        self.biases['b0'] = tf.Variable(tf.zeros([self.n_hidden]))
        for i in range(1, self.n_layer):
            self.weights['h{}'.format(i)] = tf.Variable(
                tf.random_normal([self.n_hidden, self.n_hidden]))
            self.biases['b{}'.format(i)] = tf.Variable(
                tf.zeros([self.n_hidden]))
        self.weights['out'] = tf.Variable(
            tf.random_normal([self.n_hidden, self.n_input]))
        self.biases['b_out'] = tf.Variable(tf.zeros([self.n_input]))

    def build_code_graph(self, inputs):

        # Input layer
        layer = tf.nn.sigmoid(
            tf.add(
                tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                          self.weights['h0']), self.biases['b0']))
        for i in range(1, self.n_layer):
            layer = tf.nn.sigmoid(
                tf.add(tf.matmul(layer, self.weights['h{}'.format(i)]),
                       self.biases['b{}'.format(i)]))
        # Output fully connected layer with a neuron
        code = tf.nn.tanh(
            tf.matmul(layer, self.weights['out']) + self.biases['b_out'])

        return code

    def build_lin_code_graph(self, inputs):

        # Output fully connected layer with a neuron
        code = tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                         self.weights['out']) + self.biases['b_out']

        return code

    def build_train_graph(self):

        self.cur_batch_size = tf.placeholder('float32', name='batch_size')

        self.pos_f_inputs = tf.placeholder('float32', [None, self.n_input])
        self.pos_g_inputs = tf.placeholder('float32', [None, self.n_input])

        self.PF = self.build_code_graph(
            self.pos_f_inputs)  # batch_size*n_input

        # train loss
        self.loss = tf.reduce_mean(.5 * tf.square(self.PF - self.pos_g_inputs))

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)
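
The training objective above is simply half the mean squared error between the mapped f-side embedding and its anchored g-side embedding; a NumPy view of the same quantity with toy shapes:

import numpy as np

pf = np.random.rand(4, 64)   # build_code_graph output for the f-side batch (toy shapes)
pg = np.random.rand(4, 64)   # paired g-side embeddings
loss = np.mean(0.5 * (pf - pg) ** 2)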

    def build_valid_graph(self):

        # validation
        self.valid_f_inputs = tf.placeholder(
            'float32', [None, self.valid_sample_size, self.n_input])
        self.valid_g_inputs = tf.placeholder(
            'float32', [None, self.valid_sample_size, self.n_input])

        valid_f = tf.reshape(
            self.build_code_graph(self.valid_f_inputs),
            [-1, self.valid_sample_size, self.n_input])  # batch_size*valid_sample_size*n_input
        self.dot_dist = tf.reduce_sum(
            tf.pow(valid_f - self.valid_g_inputs, 2.), axis=2)
        # self.hamming_dist = tf.reduce_sum(
        # 						tf.clip_by_value(tf.sign(tf.multiply(tf.sign(valid_f),tf.sign(valid_g))),.0,1.)
        # 							, axis=2
        # 						)
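
Despite its name, dot_dist is a squared Euclidean distance: for every validation case it measures how far the mapped f-embedding lands from each of the valid_sample_size candidate g-embeddings. A NumPy equivalent with toy shapes:

import numpy as np

valid_f = np.random.rand(8, 10, 64)                  # (batch, valid_sample_size, n_input)
valid_g = np.random.rand(8, 10, 64)
dot_dist = np.sum((valid_f - valid_g) ** 2, axis=2)  # (batch, valid_sample_size)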

    def train_one_epoch(self):
        sum_loss = 0.0

        # train process
        # with tf.device(self.device):
        batches = batch_iter(self.L, self.batch_size, 0,
                             self.lookup_f, self.lookup_g, 'f', 'g')
        batch_id = 0
        for batch in batches:
            pos_f, pos_g, neg_f, neg_g = batch
            if len(pos_f) != len(pos_g):
                self.logger.info(
                    'The input label file appears to be malformed; skipping this batch.')
                continue
            batch_size = len(pos_f)
            feed_dict = {
                self.pos_f_inputs: self.X[pos_f, :],
                self.pos_g_inputs: self.Y[pos_g, :],
                self.cur_batch_size: batch_size
            }
            _, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict)

            sum_loss += cur_loss
            # self.logger.info('Finish processing batch {} and cur_loss={}'
            #                        .format(batch_id, cur_loss))
            batch_id += 1
        # valid process
        valid_f, valid_g = valid_iter(self.L, self.valid_sample_size,
                                      self.lookup_f, self.lookup_g, 'f', 'g')
        # print valid_f,valid_g
        if len(valid_f) != len(valid_g):
            self.logger.info(
                'The input label file appears to be malformed; skipping validation.')
            return
        valid_size = len(valid_f)
        feed_dict = {
            self.valid_f_inputs: self.X[valid_f, :],
            self.valid_g_inputs: self.Y[valid_g, :]
        }
        valid_dist = self.sess.run(self.dot_dist, feed_dict)
        # valid_dist = self.sess.run(self.hamming_dist,feed_dict)
        mrr = .0
        for i in range(valid_size):
            fst_dist = valid_dist[i][0]
            pos = 1
            for k in range(1, len(valid_dist[i])):
                if fst_dist >= valid_dist[i][k]:
                    pos += 1
            # print pos
            # self.logger.info('dist:{},pos:{}'.format(fst_dist,pos))
            # print valid_dist[i]
            mrr += 1. / pos
        self.logger.info('Epoch={}, avg loss={!s}, mrr={}'.format(
            self.cur_epoch, sum_loss / batch_id, mrr / valid_size))
        # print 'mrr:',mrr/valid_size
        # self.logger.info('Epoch={}, sum of loss={!s}, valid_loss={}'
        #                     .format(self.cur_epoch, sum_loss/batch_id, valid_loss))
        self.cur_epoch += 1
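
The MRR loop relies on the convention that index 0 of each valid_dist row is the true counterpart; its rank is 1 plus the number of other candidates whose distance is less than or equal to it. A compact NumPy sketch of the same computation:

import numpy as np

valid_dist = np.array([[0.2, 0.5, 0.1],    # true pair ranked 2nd -> reciprocal rank 1/2
                       [0.3, 0.9, 0.8]])   # true pair ranked 1st -> reciprocal rank 1/1
ranks = 1 + np.sum(valid_dist[:, 1:] <= valid_dist[:, [0]], axis=1)
mrr = np.mean(1.0 / ranks)                 # (0.5 + 1.0) / 2 = 0.75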

    def _write_in_file(self, filename, vec, tag):
        with open(filename, 'a') as res_handler:  # append mode ('aw' is not a valid file mode)
            if len(vec.shape) > 1:
                column_size = vec.shape[1]
            else:
                column_size = 1
            reshape_vec = vec.reshape(-1)
            vec_size = len(reshape_vec)
            res_handler.write(tag + '\n')
            for i in range(0, vec_size, column_size):
                res_handler.write('{}\n'.format(' '.join(
                    [str(reshape_vec[i + k]) for k in range(column_size)])))

    def save_models(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
        for k, v in self.weights.items():  # .iteritems() exists only in Python 2
            self._write_in_file(filename, v.eval(self.sess), k)
        for k, v in self.biases.items():
            self._write_in_file(filename, v.eval(self.sess), k)
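
For reference, the file written by save_models is a plain-text dump: each parameter's tag on its own line, followed by one row of space-separated values per matrix row (a 1-D bias is written one value per line). A sketch of the output for a hypothetical 2x2 weight 'h0' and 2-element bias 'b0':

h0
0.1 0.2
0.3 0.4
b0
0.0
0.0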
Example n. 24
0
def main(args):
    t1 = time.time()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    logger = LogHandler('RUN.' +
                        time.strftime('%Y-%m-%d', time.localtime(time.time())))
    logger.info(args)

    SAVING_STEP = args.saving_step
    MAX_EPOCHS = args.max_epochs
    files = {
        'feat-src': args.feature_src,
        'feat-end': args.feature_end,
        'linkage': args.identity_linkage
    }
    if args.method == 'half-sp':
        model = HALF_SP(learning_rate=args.lr,
                        batch_size=args.batch_size,
                        neg_ratio=args.neg_ratio,
                        gamma=args.gamma,
                        eta=args.eta,
                        n_input=args.input_size,
                        n_out=args.output_size,
                        n_hidden=args.hidden_size,
                        n_layer=args.layers,
                        is_valid=args.is_valid,
                        files=files,
                        type_model=args.type_model,
                        log_file=args.log_file,
                        device=args.device)
    elif args.method == 'half-dp':
        model = HALF_DP(learning_rate=args.lr,
                        batch_size=args.batch_size,
                        neg_ratio=args.neg_ratio,
                        gamma=args.gamma,
                        eta=args.eta,
                        n_input=args.input_size,
                        n_out=args.output_size,
                        n_hidden=args.hidden_size,
                        n_layer=args.layers,
                        is_valid=args.is_valid,
                        files=files,
                        type_model=args.type_model,
                        log_file=args.log_file,
                        device=args.device)

    losses = np.zeros(MAX_EPOCHS)
    val_scrs = np.zeros(MAX_EPOCHS)
    best_scr = .0
    best_epoch = 0
    thres = 3
    for i in range(1, MAX_EPOCHS + 1):
        losses[i - 1], val_scrs[i - 1] = model.train_one_epoch()
        if i % SAVING_STEP == 0:
            loss_mean = np.mean(losses[i - SAVING_STEP:i])
            scr_mean = np.mean(val_scrs[i - SAVING_STEP:i])
            logger.info(
                'loss in last {} epochs: {}, validation in last {} epochs: {}'
                .format(SAVING_STEP, loss_mean, SAVING_STEP, scr_mean))
            if scr_mean > best_scr:
                best_scr = scr_mean
                best_epoch = i
                model.save_models(args.output)
            if args.early_stop and i >= thres * SAVING_STEP:
                cnt = 0
                for k in range(thres - 1, -1, -1):
                    cur_val = np.mean(val_scrs[i - (k + 1) * SAVING_STEP:i -
                                               k * SAVING_STEP])
                    if cur_val <= best_scr:
                        cnt += 1
                if cnt == thres and (i - best_epoch) >= thres * SAVING_STEP:
                    logger.info('*********early stop*********')
                    logger.info(
                        'The best epoch: {}\nThe validation score: {}'.format(
                            best_epoch, best_scr))
                    break
    t2 = time.time()
    print('time cost:', t2 - t1)
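
main() reads quite a few attributes from args. A hedged argparse sketch that would supply all of them; the flag spellings and defaults below are assumptions, only the attribute names are taken from the code above:

import argparse

def parse_args():
    p = argparse.ArgumentParser()
    p.add_argument('--method', choices=['half-sp', 'half-dp'], default='half-sp')
    p.add_argument('--feature-src', dest='feature_src', required=True)
    p.add_argument('--feature-end', dest='feature_end', required=True)
    p.add_argument('--identity-linkage', dest='identity_linkage', required=True)
    p.add_argument('--output', required=True)
    p.add_argument('--lr', type=float, default=0.01)
    p.add_argument('--batch-size', dest='batch_size', type=int, default=1024)
    p.add_argument('--neg-ratio', dest='neg_ratio', type=int, default=5)
    p.add_argument('--gamma', type=float, default=1.0)
    p.add_argument('--eta', type=float, default=1.0)
    p.add_argument('--input-size', dest='input_size', type=int, default=128)
    p.add_argument('--output-size', dest='output_size', type=int, default=64)
    p.add_argument('--hidden-size', dest='hidden_size', type=int, default=128)
    p.add_argument('--layers', type=int, default=2)
    p.add_argument('--type-model', dest='type_model', default='mlp')
    p.add_argument('--is-valid', dest='is_valid', action='store_true')
    p.add_argument('--early-stop', dest='early_stop', action='store_true')
    p.add_argument('--saving-step', dest='saving_step', type=int, default=5)
    p.add_argument('--max-epochs', dest='max_epochs', type=int, default=200)
    p.add_argument('--device', default='/cpu:0')
    p.add_argument('--gpu-id', dest='gpu_id', default='0')
    p.add_argument('--log-file', dest='log_file', default='half')
    return p.parse_args()

if __name__ == '__main__':
    main(parse_args())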