def interactiveGRUTraining( trainingDataFile=main_dir + 'facebook.splits/train.10/train_classmate_1', wordsEmbeddings=None, wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures', typesEmbeddings=None, typesEmbeddings_path='', word_dimension=22, type_dimension=20, dimension=64, attention_dimension=12, wordsSize=1000000, subpaths_map=None, subpaths_file=main_dir + 'facebook/subpathsSaveFile', sequences_map=None, sequences_file='', maxlen_subpaths=1000, maxlen=100, # Sequence longer then this get ignored batch_size=1, is_shuffle_for_batch=False, alpha=0.1, beta=0.1, gamma=0.1, objective_function_method='hinge-loss', objective_function_param=0, lrate=0.0001, max_epochs=10, dispFreq=5, saveFreq=5, saveto=main_dir + 'facebook/path2vec-modelParams.npz', decay=0.01, ): model_options = locals().copy() if wordsEmbeddings is None: if wordsEmbeddings_path is not None: wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings( wordsEmbeddings_path) else: print 'Exit...' exit(0) if typesEmbeddings is None: if typesEmbeddings_path is not None: typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings( typesEmbeddings_path) else: print 'Exit...' exit(0) trainingData, trainingPairsData = dataProcessTools.getTrainingData( trainingDataFile) allBatches = dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file) params = init_sharedVariables(model_options) tparams = init_tparams(params) print 'Generate models ......' trainingParis, sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix, wordsEmbs, typesEmbs, masks_matrix, groups_tensor, cost = interactiveGRULearningBatch.interactiveGRULearning( model_options, tparams) print 'Generate gradients ......' grads = tensor.grad(cost, wrt=list(tparams.values())) print 'Using Adadelta to generate functions ......' lr = tensor.scalar(name='lr') f_grad_shared, f_update = adadelta( lr, tparams, grads, trainingParis, sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix, wordsEmbs, typesEmbs, masks_matrix, groups_tensor, cost) print 'Start training models ......' 
    best_p = None
    history_cost = []
    start_time = time.time()
    print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    uidx = 0
    for eidx in range(max_epochs):
        for _, batch in allBatches:
            uidx += 1
            trainingDataForBatch = [trainingData[i] for i in batch]
            trainingPairsForBatch = [trainingPairsData[i] for i in batch]
            (trainingParis_data, sequences_matrix_data, dependency_matrix_data,
             dependWeight_matrix_data, sequencesLen_vector_data, discountSeq_matrix_data,
             discountForEachNode_matrix_data, masks_matrix_data,
             groups_tensor_data) = dataProcessTools.prepareDataForTrainingBatch(
                trainingDataForBatch, trainingPairsForBatch, sequences_data, alpha, beta, gamma)
            if len(trainingParis_data) == 0:
                continue
            cost = f_grad_shared(trainingParis_data, sequences_matrix_data, dependency_matrix_data,
                                 dependWeight_matrix_data, sequencesLen_vector_data, discountSeq_matrix_data,
                                 discountForEachNode_matrix_data, wordsEmbeddings, typesEmbeddings,
                                 masks_matrix_data, groups_tensor_data)
            f_update(lrate)
            if numpy.isnan(cost) or numpy.isinf(cost):
                print('bad cost detected: ', cost)
                return
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost
            if saveto and numpy.mod(uidx, saveFreq) == 0:
                print 'Saving... time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_cost, **params)
                pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                print('Done')
    end_time = time.time()
    print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))
    print 'Training finished! Cost time == ', end_time - start_time, ' s'
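# ---------------------------------------------------------------------------
# Hedged sketch (not this repo's Theano implementation): what the adadelta()
# helper used above conceptually provides. f_grad_shared evaluates the cost and
# stores the gradients; f_update then applies the Adadelta rule (Zeiler, 2012)
# to the shared parameters. The names rho, eps, accum_g2 and accum_dx2 below
# are illustrative only.
import numpy


def adadelta_step(param, grad, accum_g2, accum_dx2, rho=0.95, eps=1e-6):
    """One Adadelta update for a single numpy parameter array."""
    accum_g2 = rho * accum_g2 + (1. - rho) * grad ** 2            # running E[g^2]
    delta = -numpy.sqrt(accum_dx2 + eps) / numpy.sqrt(accum_g2 + eps) * grad
    accum_dx2 = rho * accum_dx2 + (1. - rho) * delta ** 2         # running E[dx^2]
    return param + delta, accum_g2, accum_dx2
# ---------------------------------------------------------------------------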
def proxEmbedBySubgraphs( trainingDataFile=main_dir + 'train_classmate', wordsEmbeddings_data=None, wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures', subpaths_map=None, subpaths_file=main_dir + 'facebook/subpathsSaveFile', subgraphSaveFile='', maxlen_subpaths=1000, wordsSize=1000000, maxlen=100, batch_size=1, is_shuffle_for_batch=False, dispFreq=5, saveFreq=5, saveto=main_dir + 'facebook/path2vec-modelParams.npz', lrate=0.0001, word_dimension=22, dimension=64, discount_alpha=0.3, discount_beta=0.3, h_output_method='max-pooling', objective_function_method='hinge-loss', objective_function_param=0, max_epochs=10, decay=0.01, ): model_options = locals().copy() if wordsEmbeddings_data is None: if wordsEmbeddings_path is not None: wordsEmbeddings_data, word_dimension, wordsSize = dataProcessTools.getWordsEmbeddings( wordsEmbeddings_path) else: exit(0) trainingData, trainingPairs_data = dataProcessTools.getTrainingData( trainingDataFile) allBatches = dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) subgraphs = dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths( subgraphSaveFile) params = init_sharedVariables(model_options) tparams = init_tparams(params) print 'Generate models ......' trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost = proxEmbedBySubgraphModel.proxEmbedBySubgraphModel( model_options, tparams) print 'Generate gradients ......' grads = tensor.grad(cost, wrt=list(tparams.values())) print 'Using Adadelta to generate functions ......' this_time = time.time() print 'Start to compile and optimize, time ==', time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(this_time)) lr = tensor.scalar(name='lr') f_grad_shared, f_update = adadelta(lr, tparams, grads, trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost) print 'Start training models ......' best_p = None history_cost = [] start_time = time.time() print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)) uidx = 0 for eidx in range(max_epochs): for _, batch in allBatches: uidx += 1 trainingDataForBatch = [trainingData[i] for i in batch] trainingPairsForBatch = [trainingPairs_data[i] for i in batch] tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data, nodesLens_data = dataProcessTools.generateSequenceAndMasksForSingleSequenceWithLength( trainingDataForBatch, trainingPairsForBatch, subgraphs, dimension) cost = f_grad_shared(tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, wordsEmbeddings_data, buffer_tensor_data, nodesLens_data) f_update(lrate) if numpy.isnan(cost) or numpy.isinf(cost): print('bad cost detected: ', cost) return if numpy.mod(uidx, dispFreq) == 0: print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost this_time = time.time() print 'Time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(this_time)) if saveto and numpy.mod(uidx, saveFreq) == 0: print('Saving...') if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_cost, **params) pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) print('Done') gc.collect() end_time = time.time() print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time)) print 'Training finished! Cost time == ', end_time - start_time, ' s'
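# ---------------------------------------------------------------------------
# Hedged sketch: the discount_alpha / discount_beta parameters above down-weight
# long subpaths ("the longer the subpath, the smaller its weight"). The exact
# formula lives in the data-preparation and model code; the exponential decay
# below is one common choice and is shown purely for illustration.
import numpy


def length_discount(path_len, alpha):
    """Illustrative discount weight for a subpath with path_len nodes."""
    return numpy.exp(-alpha * (path_len - 1))

# e.g. with alpha=0.3: length 2 -> 0.74, length 5 -> 0.30, length 10 -> 0.07
# ---------------------------------------------------------------------------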
def compute_proxEmbedBySubgraph(
        wordsEmbeddings=None,
        wordsEmbeddings_path=None,
        word_dimension=0,
        dimension=0,
        wordsSize=0,
        subpaths_map=None,
        subpaths_file=None,
        subgraphs_file='',
        maxlen_subpaths=1000,
        maxlen=100,  # sequences longer than this are ignored
        test_data_file='',
        top_num=10,
        ideal_data_file='',
        func=None,
):
    """
    Evaluate the subgraph-based model: score every query-candidate pair in
    test_data_file with func and return MAP and MnDCG against ideal_data_file.
    """
    model_options = locals().copy()
    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, word_dimension, wordsSize = dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path)
        else:
            exit(0)
    subgraphs_map = dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(subgraphs_file)
    line_count = 0
    test_map = {}
    print 'Compute MAP and nDCG for file ', test_data_file
    with open(test_data_file) as f:
        for l in f:
            arr = l.strip().split()
            query = int(arr[0])
            map = {}
            for i in range(1, len(arr)):
                candidate = int(arr[i])
                (sequences_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data,
                 nodesLens_data) = dataProcessTools.prepareDataForTestForSubgraphSingleSequenceWithLengthsAsymmetric(
                    query, candidate, subgraphs_map, dimension)
                if sequences_data is None and mask_data is None and lens_data is None:
                    map[candidate] = -1000.
                else:
                    value = func(sequences_data, mask_data, lens_data, subgraph_lens_data,
                                 wordsEmbeddings, buffer_tensor_data, nodesLens_data)
                    map[candidate] = value
            tops_in_line = toolsFunction.mapSortByValueDESC(map, top_num)
            test_map[line_count] = tops_in_line
            line_count += 1
    line_count = 0
    ideal_map = {}
    with open(ideal_data_file) as f:
        for l in f:
            arr = l.strip().split()
            arr = [int(x) for x in arr]
            ideal_map[line_count] = arr[1:]
            line_count += 1
    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)
    return MAP, MnDCG
def compute_metagraphAttention( wordsEmbeddings=None, # words embeddings wordsEmbeddings_path=None, # the file path of words embeddings metagraphEmbeddings_path=None, # the file path of metagraph embeddings wordsSize=0, # the size of words vocabulary subpaths_map=None, # contains sub-paths subpaths_file=None, # the file which contains sub-paths maxlen_subpaths=1000, # the max length for sub-paths test_data_file='', # test data file top_num=10, # top num in experiments ideal_data_file='', # ideal data file func=None, # the MPE process model ): """ evaluate the MPE model """ model_options = locals().copy() if wordsEmbeddings is None: if wordsEmbeddings_path is not None: wordsEmbeddings,dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) else: print 'There is not path for wordsEmbeddings, exit!!!' exit(0) if subpaths_map is None: if subpaths_file is not None: subpaths_map=dataProcessTools.loadAllSubPathsRomove0Path(subpaths_file, maxlen_subpaths, wordsEmbeddings) else: print 'There is not path for sub-paths, exit!!!' exit(0) metagraphEmbedding_data, metagraphDimension, metagraphSize=dataProcessTools.getMetagraphEmbeddings(metagraphEmbeddings_path) line_count=0 test_map={} print 'Compute MAP and nDCG for file ',test_data_file with open(test_data_file) as f: for l in f: arr=l.strip().split() query=int(arr[0]) map={} for i in range(1,len(arr)): candidate=int(arr[i]) subPaths_matrix_data,subPaths_mask_data,subPaths_lens_data=dataProcessTools.prepareDataForTest(query, candidate, subpaths_map) if subPaths_matrix_data is None and subPaths_mask_data is None and subPaths_lens_data is None: map[candidate]=-1000. else: value=func(metagraphEmbedding_data, subPaths_matrix_data, subPaths_mask_data, wordsEmbeddings) map[candidate]=value del subPaths_matrix_data del subPaths_mask_data del subPaths_lens_data tops_in_line=toolsFunction.mapSortByValueDESC(map, top_num) test_map[line_count]=tops_in_line line_count+=1 map=None gc.collect() line_count=0 ideal_map={} with open(ideal_data_file) as f: for l in f: arr=l.strip().split() arr=[int(x) for x in arr] ideal_map[line_count]=arr[1:] line_count+=1 MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map) MnDCG=evaluateTools.get_MnDCG(top_num, ideal_map, test_map) return MAP,MnDCG
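# ---------------------------------------------------------------------------
# Hedged sketch of what toolsFunction.mapSortByValueDESC(map, top_num) is used
# for above: keep the top_num candidates with the highest scores, best first.
# This is an illustration, not the repo's helper.


def top_k_by_value_desc(score_map, top_num):
    """Return the keys of score_map with the top_num largest values, sorted descending."""
    ranked = sorted(score_map.items(), key=lambda kv: kv[1], reverse=True)
    return [key for key, _ in ranked[:top_num]]

# top_k_by_value_desc({7: 0.9, 3: -1000., 5: 0.4}, 2) -> [7, 5]
# ---------------------------------------------------------------------------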
def compute_proxEmbed( wordsEmbeddings=None, # words embeddings wordsEmbeddings_path=None, # the file path of words embeddings word_dimension=0, # dimension of words embeddings dimension=0, # the dimension of paths embeddings wordsSize=0, # the size of words vocabulary subpaths_map=None, # contains sub-paths subpaths_file=None, # the file which contains sub-paths maxlen_subpaths=1000, # the max length for sub-paths maxlen=100, # Sequence longer then this get ignored test_data_file='', # the file path of test data top_num=10, # the top num to predict ideal_data_file='', # ground truth func=None, # model function ): """ compute the result of the model """ model_options = locals().copy() if wordsEmbeddings is None: if wordsEmbeddings_path is not None: wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings( wordsEmbeddings_path) else: print 'There is not path for wordsEmbeddings, exit!!!' exit(0) if subpaths_map is None: if subpaths_file is not None: subpaths_map = dataProcessTools.loadAllSubPaths( subpaths_file, maxlen_subpaths) else: print 'There is not path for sub-paths, exit!!!' exit(0) line_count = 0 test_map = {} print 'Compute MAP and nDCG for file ', test_data_file with open(test_data_file) as f: for l in f: arr = l.strip().split() query = int(arr[0]) map = {} for i in range(1, len(arr)): candidate = int(arr[i]) subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = dataProcessTools.prepareDataForTest( query, candidate, subpaths_map) if subPaths_matrix_data is None and subPaths_mask_data is None and subPaths_lens_data is None: map[candidate] = -1000. else: value = func(subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data, wordsEmbeddings) map[candidate] = value tops_in_line = toolsFunction.mapSortByValueDESC(map, top_num) test_map[line_count] = tops_in_line line_count += 1 line_count = 0 ideal_map = {} with open(ideal_data_file) as f: for l in f: arr = l.strip().split() arr = [int(x) for x in arr] ideal_map[line_count] = arr[1:] line_count += 1 MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map) MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map) return MAP, MnDCG
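# ---------------------------------------------------------------------------
# Hedged sketch of the two metrics computed by evaluateTools.get_MAP and
# evaluateTools.get_MnDCG above. The repo's exact definitions live in
# evaluateTools; this is the standard binary-relevance formulation of AP@k and
# nDCG@k averaged over queries, shown only for reference.
import math


def average_precision_at_k(ranked, relevant, k):
    relevant = set(relevant)
    hits, score = 0, 0.0
    for i, item in enumerate(ranked[:k]):
        if item in relevant:
            hits += 1
            score += hits / float(i + 1)
    return score / max(1, min(k, len(relevant)))


def ndcg_at_k(ranked, relevant, k):
    relevant = set(relevant)
    dcg = sum(1.0 / math.log(i + 2, 2) for i, item in enumerate(ranked[:k]) if item in relevant)
    idcg = sum(1.0 / math.log(i + 2, 2) for i in range(min(k, len(relevant))))
    return dcg / idcg if idcg > 0 else 0.0


def mean_over_queries(metric, k, ideal_map, test_map):
    """e.g. mean_over_queries(ndcg_at_k, 10, ideal_map, test_map)"""
    keys = list(test_map.keys())
    return sum(metric(test_map[q], ideal_map[q], k) for q in keys) / float(len(keys))
# ---------------------------------------------------------------------------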
def experiment_for_batch(num):
    cf = ConfigParser.SafeConfigParser()
    cf.read("pythonParamsConfig")

    suffix = "5"
    index = "1"
    class_name = cf.get("param", "class_name")  # the relation name of the data
    # the full path of the training data file; generated from main_dir, dataset_name, suffix, class_name and index
    trainingDataFile = os.path.join(main_dir + '/', dataset_name + '.splits', 'train.' + suffix, 'train_' + class_name + '_' + '1')
    wordsEmbeddings = None  # words embeddings
    # NOTE: wordsEmbeddings_path, word_dimension, subpaths_file, batch_size and max_epochs are all
    # used further down in this function, so they are read from the config here. The
    # "wordsEmbeddings_path" and "word_dimension" key names are assumptions.
    wordsEmbeddings_path = cf.get("param", "wordsEmbeddings_path")  # the file path of words embeddings (assumed key)
    word_dimension = cf.getint("param", "word_dimension")  # dimension of words embeddings (assumed key)
    dimension = cf.getint("param", "dimension")  # the dimension of path embeddings
    wordsSize = cf.getint("param", "wordsSize")  # the size of the words vocabulary
    subpaths_map = None  # contains sub-paths
    subpaths_file = cf.get("param", "subpaths_file")  # the file which contains sub-paths
    maxlen_subpaths = cf.getint("param", "maxlen_subpaths")  # the max length for sub-paths
    # the output of the lstm for one path: "h" uses only the last output h,
    # "mean-pooling" uses the mean of all hi, "max-pooling" uses the element-wise max of all hi
    h_output_method = cf.get("param", "h_output_method")
    maxlen = cf.getint("param", "maxlen")  # sequences longer than this are ignored
    batch_size = cf.getint("param", "batch_size")  # the size of a training batch
    is_shuffle_for_batch = cf.getboolean("param", "is_shuffle_for_batch")  # whether to shuffle the training data
    discount_alpha = cf.getfloat("param", "discount_alpha")  # discount parameter alpha: the longer the subpath, the smaller its weight
    # how to combine several subpaths into one: "mean-pooling" or "max-pooling"
    subpaths_pooling_method = cf.get("param", "subpaths_pooling_method")
    objective_function_method = cf.get("param", "objective_function_method")  # loss function
    objective_function_param = cf.getfloat("param", "objective_function_param")  # the parameter beta in the loss function
    lrate = cf.getfloat("param", "lrate")  # learning rate
    max_epochs = cf.getint("param", "max_epochs")  # the max number of training epochs
    dispFreq = cf.getint("param", "dispFreq")  # how often to display progress
    saveFreq = cf.getint("param", "saveFreq")  # how often to save the parameters
    # the path for saving parameters; generated from main_dir, dataset_name, suffix, class_name and index
    saveto = os.path.join(main_dir + '/', dataset_name + '.trainModels', 'train.' + suffix, 'train_' + class_name + '_' + index + '.npz')
# the normalization of this model, l2-norm of all parameters decay_lstm_W = cf.getfloat("param", "decay_lstm_W") decay_lstm_U = cf.getfloat("param", "decay_lstm_U") decay_lstm_b = cf.getfloat("param", "decay_lstm_b") decay_w = cf.getfloat("param", "decay_w") test_data_file = os.path.join(main_dir + '/', dataset_name + '.splits', 'test', 'test_' + class_name + '_' + index) # the file of test data top_num = cf.getint("param", "top_num") # the top num to predict ideal_data_file = os.path.join(main_dir + '/', dataset_name + '.splits', 'ideal', 'ideal_' + class_name + '_') # the file of ground truth print("taringDatafile:", trainingDataFile) print("wordsEmbeddings_path:", wordsEmbeddings_path) test_file = [] NDCG10 = [] NDCG20 = [] if wordsEmbeddings is None: if wordsEmbeddings_path is not None: wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) # print("wordsEmbeddings:", wordsEmbeddings.shape, dimension, wordsSize) else: print 'There is not path for wordsEmbeddings, exit!!!' exit(0) if subpaths_map is None: if subpaths_file is not None: subpaths_map = dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) # print("subpaths_map:", len(subpaths_map)) # print(subpaths_map) else: print 'There is not path for sub-paths, exit!!!' exit(0) # 首先训练模型 cost_time = proxEmbed2.proxEmbedTraining( trainingDataFile, wordsEmbeddings, wordsEmbeddings_path, word_dimension, dimension, wordsSize, subpaths_map, subpaths_file, maxlen_subpaths, h_output_method, maxlen, batch_size, is_shuffle_for_batch, discount_alpha, subpaths_pooling_method, objective_function_method, objective_function_param, lrate, max_epochs, dispFreq, saveFreq, saveto, decay_lstm_W, decay_lstm_U, decay_lstm_b, decay_w, num, dataset_name, class_name, main_dir ) # load the function which is trained beforehand for num_of_group in range(num): num_of_group += 1 suffix = str(num_of_group) index = str(num_of_group) saveto = os.path.join(main_dir + '/', dataset_name + '.trainModels', 'train.' + suffix, 'train_' + class_name + '_' + index + '.npz') # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. computeFunc = proxEmbedProcessAndAssess.get_proxEmbedModel( saveto, word_dimension, dimension, h_output_method, discount_alpha, subpaths_pooling_method, ) # test the model return MnDCG10, MnDCG20 test_data_file = os.path.join(main_dir + '/', dataset_name + '.splits', 'test', 'test_' + class_name + '_' + index) # the file of test data ideal_data_file = os.path.join(main_dir + '/', dataset_name + '.splits', 'ideal', 'ideal_' + class_name + '_' + index) # the file of ground truth test_file.append(test_data_file) MnDCG10, MnDCG20 = proxEmbedProcessAndAssess.compute_proxEmbed( wordsEmbeddings, wordsEmbeddings_path, word_dimension, dimension, wordsSize, subpaths_map, subpaths_file, maxlen_subpaths, maxlen, test_data_file, top_num, ideal_data_file, func=computeFunc, ) print 'MnDCG10==', MnDCG10 print 'MnDCG20==', MnDCG20 NDCG10.append(MnDCG10) NDCG20.append(MnDCG20) return list(zip(test_file, cost_time, NDCG10, NDCG20))
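# ---------------------------------------------------------------------------
# Hedged usage sketch for experiment_for_batch(num) above: it trains one model
# per split and returns a list of (test_file, cost_time, NDCG10, NDCG20) tuples,
# one per split, so a caller can average the metrics. Illustration only.


def summarize_experiment(num_splits):
    rows = experiment_for_batch(num_splits)
    ndcg10 = [r[2] for r in rows]
    ndcg20 = [r[3] for r in rows]
    print 'mean NDCG@10 =', sum(ndcg10) / float(len(ndcg10))
    print 'mean NDCG@20 =', sum(ndcg20) / float(len(ndcg20))
    return rows
# ---------------------------------------------------------------------------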
def compute_path2vec( wordsEmbeddings=None, wordsEmbeddings_path='None', typesEmbeddings=None, typesEmbeddings_path='None', word_dimension=0, type_dimension=0, dimension=0, attention_dimension=0, wordsSize=0, subpaths_map=None, subpaths_file='', sequences_map=None, sequences_file='', maxlen_subpaths=1000, maxlen=100, # Sequence longer then this get ignored alpha=0, beta=0, gamma=0, test_data_file='', top_num=10, ideal_data_file='', func=None, ): model_options = locals().copy() if wordsEmbeddings is None: if wordsEmbeddings_path is not None: wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings( wordsEmbeddings_path) else: print 'Exit...' exit(0) if typesEmbeddings is None: if typesEmbeddings_path is not None: typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings( typesEmbeddings_path) else: print 'Exit...' exit(0) sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file) errCount = 0 line_count = 0 test_map = {} print 'Compute MAP and nDCG for file ', test_data_file with open(test_data_file) as f: for l in f: arr = l.strip().split() query = int(arr[0]) map = {} candidates = [] for i in range(1, len(arr)): key1 = arr[0] + '-' + arr[i] key2 = arr[i] + '-' + arr[0] if key1 in sequences_data or key2 in sequences_data: candidates.append(int(arr[i])) else: map[int(arr[i])] = -1000. errCount += 1 sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix, masks_matrix, group_tensor = dataProcessTools.prepareDataForTestBatch( query, candidates, sequences_data, alpha, beta, gamma) if len(sequences_matrix) > 0: scores = func(sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix, wordsEmbeddings, typesEmbeddings, masks_matrix, group_tensor) for index in range(len(candidates)): map[candidates[index]] = scores[index] else: for i in range(1, len(arr)): map[int(arr[i])] = -1. tops_in_line = toolsFunction.mapSortByValueDESC(map, top_num) test_map[line_count] = tops_in_line line_count += 1 if line_count % 500 == 0: print '+', if line_count % 5000 == 0: print ' time ==', time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(time.time())) line_count = 0 ideal_map = {} with open(ideal_data_file) as f: for l in f: arr = l.strip().split() arr = [int(x) for x in arr] ideal_map[line_count] = arr[1:] line_count += 1 MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map) MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map) print 'errCount =', errCount return MAP, MnDCG
def metagraphAttentionTraining( trainingDataFile=main_dir + 'facebook.splits/train.10/train_classmate_1', # the full path of training data file metagraphEmbeddings_path='', # the file path of metagraph embeddings wordsEmbeddings_data=None, # words embeddings wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures', # the file path of words embeddings wordsSize=1000000, # the size of words vocabulary subpaths_map=None, # contains sub-paths subpaths_file=main_dir + 'facebook/subpathsSaveFile', # the file which contains sub-paths maxlen_subpaths=1000, # the max length for sub-paths maxlen=100, # Sequence longer then this get ignored batch_size=10, # use a batch for training. This is the size of this batch. is_shuffle_for_batch=True, # if need shuffle for training objective_function_method='sigmoid', # loss function, we use sigmoid here objective_function_param=0, # the parameter in loss function, beta lrate=0.0001, # learning rate max_epochs=100, # the max epochs for training dispFreq=5, # the frequences for display saveFreq=5, # the frequences for saving the parameters saveto=main_dir + 'facebook/path2vec-modelParams.npz', # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. # all dimensions parameters metagraph_embedding_dimension=10, # metagraph embedding dimension dimension_A=10, # the dimension of attention when computing the m-node embedding dimension_lstm=10, # dimension of lstm parameters dimension_B=10, # the dimension of attention when computing the m-path embedding dimension_C=10, # the dimension of attention when computing the m-paths embedding # decay parameters decay_Q_A=0.001, decay_b_A=0.001, decay_eta_A=0.001, decay_lstm_W=0.001, decay_lstm_U=0.001, decay_lstm_b=0.001, decay_Q_B=0.001, decay_b_B=0.001, decay_eta_B=0.001, decay_Q_C=0.001, decay_b_C=0.001, decay_eta_C=0.001, decay_w=0.001, ): # get all parameters model_options = locals().copy() if wordsEmbeddings_data is None: if wordsEmbeddings_path is not None: wordsEmbeddings_data, dimension, wordsSize = dataProcessTools.getWordsEmbeddings( wordsEmbeddings_path) else: print 'There is not path for wordsEmbeddings, exit!!!' exit(0) if subpaths_map is None: if subpaths_file is not None: subpaths_map = dataProcessTools.loadAllSubPathsRomove0Path( subpaths_file, maxlen_subpaths, wordsEmbeddings_data) else: print 'There is not path for sub-paths, exit!!!' exit(0) metagraphEmbedding_data, metagraphDimension, metagraphSize = dataProcessTools.getMetagraphEmbeddings( metagraphEmbeddings_path) trainingData, trainingPairs_data = dataProcessTools.getTrainingData( trainingDataFile) allBatches = dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) ''' init shared variables ''' params = init_sharedVariables(model_options) tparams = init_tparams(params) print 'Generate models ......' metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, wordsEmbeddings, cost = subgraphAttentionModelLSTMBatch.metagraphAttentionModel( model_options, tparams) print 'Generate gradients ......' grads = tensor.grad(cost, wrt=list(tparams.values())) print 'Using Adadelta to generate functions ......' this_time = time.time() print 'Start to compile and optimize, time ==', time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(this_time)) lr = tensor.scalar(name='lr') f_grad_shared, f_update = adadelta(lr, tparams, grads, metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, wordsEmbeddings, cost) print 'Start training models ......' 
best_p = None history_cost = [] # not use start_time = time.time() print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)) uidx = 0 for eidx in range(max_epochs): for _, batch in allBatches: uidx += 1 # prepare data for this model trainingDataForBatch = [trainingData[i] for i in batch] trainingPairsForBatch = [trainingPairs_data[i] for i in batch] triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = dataProcessTools.prepareDataForTraining( trainingDataForBatch, trainingPairsForBatch, subpaths_map) cost = 0 cost = f_grad_shared(metagraphEmbedding_data, triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, wordsEmbeddings_data) f_update(lrate) trainingDataForBatch = None trainingPairsForBatch = None del triples_matrix_data del subPaths_matrix_data del subPaths_mask_data del subPaths_lens_data if numpy.isnan(cost) or numpy.isinf(cost): print('bad cost detected: ', cost) return if numpy.mod(uidx, dispFreq) == 0: print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost this_time = time.time() print 'Time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(this_time)) if saveto and numpy.mod(uidx, saveFreq) == 0: print('Saving...') if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_cost, **params) pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) print('Done') gc.collect() end_time = time.time() print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time)) print 'Training finished! Cost time == ', end_time - start_time, ' s'
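# ---------------------------------------------------------------------------
# Hedged sketch: the checkpoints written above are a numpy .npz archive holding
# one array per model parameter (plus 'history_errs') and a pickled dict of
# model_options in '<saveto>.pkl'. Reloading them could look like this; the
# file names are the ones produced above, everything else is illustrative.
import pickle

import numpy


def load_checkpoint(saveto):
    archive = numpy.load(saveto)
    params = dict((name, archive[name]) for name in archive.files if name != 'history_errs')
    with open('%s.pkl' % saveto, 'rb') as f:
        options = pickle.load(f)
    return params, options
# ---------------------------------------------------------------------------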
def compute_path2vec( wordsEmbeddings=None, wordsEmbeddings_path='None', typesEmbeddings=None, typesEmbeddings_path='None', word_dimension=0, type_dimension=0, dimension=0, attention_dimension=0, wordsSize=0, subpaths_map=None, subpaths_file='', sequences_map=None, sequences_file='', maxlen_subpaths=1000, maxlen=100, alpha=0, beta=0, gamma=0, test_data_file='', top_num=10, ideal_data_file='', func=None, ): model_options = locals().copy() if wordsEmbeddings is None: if wordsEmbeddings_path is not None: wordsEmbeddings,dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) else: exit(0) if typesEmbeddings is None: if typesEmbeddings_path is not None: typesEmbeddings,type_dimension,wordsSize=dataProcessTools.getTypesEmbeddings(typesEmbeddings_path) else: exit(0) sequences_data=dataProcessTools.readAllSequencesFromFile(sequences_file) errCount=0 line_count=0 test_map={} print 'Compute MAP and nDCG for file ',test_data_file with open(test_data_file) as f: for l in f: arr=l.strip().split() query=int(arr[0]) map={} for i in range(1,len(arr)): candidate=int(arr[i]) sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix=dataProcessTools.prepareDataForTest(query, candidate, sequences_data, alpha, beta, gamma) if sequences_matrix is None or len(sequences_matrix)==0: map[candidate]=-1000. errCount+=1 else: value=func(sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix,wordsEmbeddings,typesEmbeddings) map[candidate]=value tops_in_line=toolsFunction.mapSortByValueDESC(map, top_num) test_map[line_count]=tops_in_line line_count+=1 line_count=0 ideal_map={} with open(ideal_data_file) as f: for l in f: arr=l.strip().split() arr=[int(x) for x in arr] ideal_map[line_count]=arr[1:] line_count+=1 MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map) MnDCG=evaluateTools.get_MnDCG(top_num, ideal_map, test_map) print 'errCount =',errCount return MAP,MnDCG
def compute_proxEmbed( wordsEmbeddings=None, # words embeddings wordsEmbeddings_path=None, # the file path of words embeddings word_dimension=0, # dimension of words embeddings dimension=0, # the dimension of paths embeddings wordsSize=0, # the size of words vocabulary subpaths_map=None, # contains sub-paths subpaths_file=None, # the file which contains sub-paths maxlen_subpaths=1000, # the max length for sub-paths maxlen=100, # Sequence longer then this get ignored test_data_file='', # the file path of test data top_num=10, # the top num to predict ideal_data_file='', # ground truth func=None, # model function ): """ compute the result of the model """ model_options = locals().copy() if wordsEmbeddings is None: if wordsEmbeddings_path is not None: wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) else: print 'There is not path for wordsEmbeddings, exit!!!' exit(0) if subpaths_map is None: if subpaths_file is not None: subpaths_map = dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) else: print 'There is not path for sub-paths, exit!!!' exit(0) line_count = 0 test_map = {} print 'Compute MAP and nDCG for file ', test_data_file with open(test_data_file) as f: for l in f: arr = l.strip().split() query = int(arr[0]) map = {} count_none = 0 count_have = 0 for i in range(1, len(arr)): candidate = int(arr[i]) subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = dataProcessTools.prepareDataForTestAsymmetric( query, candidate, subpaths_map) if subPaths_matrix_data is None and subPaths_mask_data is None and subPaths_lens_data is None: map[candidate] = -1000. # print(candidate, "subPaths_data is None") count_none += 1 else: count_have += 1 value = func(subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data, wordsEmbeddings) map[candidate] = value print("count_none", count_none, "count_have", count_have) tops_in_line = toolsFunction.mapSortByValueDESC(map, top_num) test_map[line_count] = tops_in_line line_count += 1 # print("map:", map) line_count = 0 ideal_map = {} with open(ideal_data_file) as f: for l in f: arr = l.strip().split() arr = [int(x) for x in arr] ideal_map[line_count] = arr[1:] line_count += 1 # MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map) MnDCG10 = evaluateTools.get_MnDCG(10, ideal_map, test_map) MnDCG20 = evaluateTools.get_MnDCG(20, ideal_map, test_map) # print("top_num:", top_num) # print("ideal_map:", ideal_map) # print("test_map:", test_map) return MnDCG10, MnDCG20
def proxEmbedTraining( trainingDataFile=main_dir + 'facebook.splits/train.10/train_classmate_1', # the full path of training data file wordsEmbeddings=None, # words embeddings wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures', # the file path of words embeddings word_dimension=22, # dimension of words embeddings dimension=64, # the dimension of paths embeddings wordsSize=1000000, # the size of words vocabulary subpaths_map=None, # contains sub-paths subpaths_file=main_dir + 'facebook/subpathsSaveFile', # the file which contains sub-paths maxlen_subpaths=1000, # the max length for sub-paths h_output_method='mean-pooling', # the output way of lstm. There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. maxlen=100, # Sequence longer then this get ignored batch_size=1, # use a batch for training. This is the size of this batch. is_shuffle_for_batch=False, # if need shuffle for training discount_alpha=0.1, # the parameter alpha for discount. The longer the subpath, the little will the weight be. subpaths_pooling_method='max-pooling', # the ways to combine several subpaths to one. "mean-pooling" means to combine all subpaths to one by mean-pooling; "max-pooling" means to combine all subpaths to one by max-pooling. objective_function_method='hinge-loss', # loss function, we use sigmoid objective_function_param=0, # the parameter in loss function, beta lrate=0.0001, # learning rate max_epochs=10, # the max epochs for training dispFreq=5, # the frequences for display saveFreq=5, # the frequences for saving the parameters saveto=main_dir + 'facebook/proxEmbed-modelParams.npz', # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. # the normalization of this model, l2-norm of all parameters decay_lstm_W=0.01, decay_lstm_U=0.01, decay_lstm_b=0.01, decay_w=0.01, ): """ The training stage of ProxEmbed """ model_options = locals().copy() if wordsEmbeddings is None: if wordsEmbeddings_path is not None: wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings( wordsEmbeddings_path) else: print 'There is not path for wordsEmbeddings, exit!!!' exit(0) if subpaths_map is None: if subpaths_file is not None: subpaths_map = dataProcessTools.loadAllSubPaths( subpaths_file, maxlen_subpaths) else: print 'There is not path for sub-paths, exit!!!' exit(0) trainingData, trainingPairs = dataProcessTools.getTrainingData( trainingDataFile) allBatches = dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) params = init_sharedVariables(model_options) tparams = init_tparams(params) print 'Generate models ......' trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost = proxEmbedModelMulti.proxEmbedModel( model_options, tparams) print 'Generate gradients ......' grads = tensor.grad(cost, wrt=list(tparams.values())) print 'Using Adadelta to generate functions ......' lr = tensor.scalar(name='lr') f_grad_shared, f_update = adadelta(lr, tparams, grads, trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost) print 'Start training models ......' 
    best_p = None
    history_cost = []
    models_count = [0, 0, 0, 0]
    start_time = time.time()
    print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))
    uidx = 0
    for eidx in range(max_epochs):
        for _, batch in allBatches:
            uidx += 1
            trainingDataForBatch = [trainingData[i] for i in batch]
            trainingPairsForBatch = [trainingPairs[i] for i in batch]
            triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = \
                dataProcessTools.prepareDataForTraining(trainingDataForBatch, trainingPairsForBatch, subpaths_map)
            cost = 0
            cost = f_grad_shared(triples_matrix_data, subPaths_matrix_data, subPaths_mask_data,
                                 subPaths_lens_data, wordsEmbeddings)
            f_update(lrate)
            if numpy.isnan(cost) or numpy.isinf(cost):
                print('bad cost detected: ', cost)
                return
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost
                print 'models_count ==', models_count
            if saveto and numpy.mod(uidx, saveFreq) == 0:
                print('Saving...')
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_cost, **params)
                pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                print('Done')
    end_time = time.time()
    print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))
    print 'Training finished! Cost time == ', end_time - start_time, ' s'
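# ---------------------------------------------------------------------------
# Hedged sketch of the 'hinge-loss' objective named above. The exact formulation
# lives in proxEmbedModelMulti; a standard pairwise hinge ranking loss over a
# (positive, negative) score pair, with the margin given by
# objective_function_param, would look like this. Illustration only.
import numpy


def pairwise_hinge_loss(score_pos, score_neg, margin=1.0):
    """Penalize the model whenever the positive pair is not ranked above the
    negative pair by at least `margin`."""
    return numpy.maximum(0.0, margin - (score_pos - score_neg))

# pairwise_hinge_loss(0.9, 0.1) -> 0.2 ; pairwise_hinge_loss(2.0, 0.1) -> 0.0
# ---------------------------------------------------------------------------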
def proxEmbedTraining( trainingDataFile=main_dir + 'facebook.splits/train.10/train_classmate_1', # the full path of training data file wordsEmbeddings=None, # words embeddings wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures', # the file path of words embeddings word_dimension=22, # dimension of words embeddings dimension=64, # the dimension of paths embeddings wordsSize=1000000, # the size of words vocabulary subpaths_map=None, # contains sub-paths subpaths_file=main_dir + 'facebook/subpathsSaveFile', # the file which contains sub-paths maxlen_subpaths=1000, # the max length for sub-paths h_output_method='mean-pooling', # the output way of lstm. There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. maxlen=100, # Sequence longer then this get ignored batch_size=1, # use a batch for training. This is the size of this batch. is_shuffle_for_batch=False, # if need shuffle for training discount_alpha=0.1, # the parameter alpha for discount. The longer the subpath, the little will the weight be. subpaths_pooling_method='max-pooling', # the ways to combine several subpaths to one. "mean-pooling" means to combine all subpaths to one by mean-pooling; "max-pooling" means to combine all subpaths to one by max-pooling. objective_function_method='hinge-loss', # loss function, we use sigmoid objective_function_param=0, # the parameter in loss function, beta lrate=0.0001, # learning rate max_epochs=10, # the max epochs for training dispFreq=5, # the frequences for display saveFreq=5, # the frequences for saving the parameters saveto=main_dir + 'facebook/proxEmbed-modelParams.npz', # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. # the normalization of this model, l2-norm of all parameters decay_lstm_W=0.01, decay_lstm_U=0.01, decay_lstm_b=0.01, decay_w=0.01, num_group=0, dataset_name="", class_name="", main_dir = "" ): """ The training stage of ProxEmbed """ model_options = locals().copy() model_options.pop('wordsEmbeddings') print(model_options) if wordsEmbeddings is None: if wordsEmbeddings_path is not None: wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) # print("wordsEmbeddings:", wordsEmbeddings.shape, dimension, wordsSize) else: print 'There is not path for wordsEmbeddings, exit!!!' exit(0) if subpaths_map is None: if subpaths_file is not None: subpaths_map = dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) # print("subpaths_map:", len(subpaths_map)) # print(subpaths_map) else: print 'There is not path for sub-paths, exit!!!' exit(0) cost_time = [] for num_of_group in range(num_group): num_of_group += 1 suffix = str(num_of_group) index = str(num_of_group) trainingDataFile = os.path.join(main_dir + '/', dataset_name + '.splits', "train." + suffix, 'train_' + class_name + '_' + '1') saveto = os.path.join(main_dir + '/', dataset_name + '.trainModels', 'train.' + suffix, 'train_' + class_name + '_' + index + '.npz') trainingData, trainingPairs = dataProcessTools.getTrainingData(trainingDataFile) allBatches = dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) params = init_sharedVariables(model_options) tparams = init_tparams(params) print 'Generate models ......' 
trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost = proxEmbedModelMulti.proxEmbedModel( model_options, tparams) print("trainingParis:", type(trainingParis), trainingParis.shape) print 'Generate gradients ......' grads = tensor.grad(cost, wrt=list(tparams.values())) print 'Using Adadelta to generate functions ......' lr = tensor.scalar(name='lr') f_grad_shared, f_update = adadelta(lr, tparams, grads, trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost) print 'Start training models ......' best_p = None history_cost = [] models_count = [0, 0, 0, 0] start_time = time.time() print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)) uidx = 0 for eidx in range(max_epochs): for _, batch in allBatches: uidx += 1 trainingDataForBatch = [trainingData[i] for i in batch] trainingPairsForBatch = [trainingPairs[i] for i in batch] triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = dataProcessTools.prepareDataForTraining( trainingDataForBatch, trainingPairsForBatch, subpaths_map) cost = 0 cost = f_grad_shared(triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data, wordsEmbeddings) f_update(lrate) if numpy.isnan(cost) or numpy.isinf(cost): print('bad cost detected: ', cost) return if numpy.mod(uidx, dispFreq) == 0: print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost print 'models_count ==', models_count if saveto and numpy.mod(uidx, saveFreq) == 0: print('Saving...') if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_cost, **params) pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) print('Done') end_time = time.time() print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time)) print 'Training finished! Cost time == ', end_time - start_time, ' s' cost_time.append(end_time - start_time) return cost_time
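# ---------------------------------------------------------------------------
# Hedged sketch of the three h_output_method choices described in the comments
# above ("h", "mean-pooling", "max-pooling"), applied to the per-step LSTM
# outputs of a single path. hs has shape (timesteps, dim) and mask marks the
# valid (unpadded) steps. This numpy version only illustrates the pooling; it
# is not the repo's Theano code.
import numpy


def pool_lstm_outputs(hs, mask, method='max-pooling'):
    hs = numpy.asarray(hs)
    mask = numpy.asarray(mask)
    valid = hs[mask > 0]               # keep only the real (unpadded) steps
    if method == 'h':                  # last valid hidden state only
        return valid[-1]
    if method == 'mean-pooling':       # average over valid steps
        return valid.mean(axis=0)
    if method == 'max-pooling':        # element-wise max over valid steps
        return valid.max(axis=0)
    raise ValueError('unknown h_output_method: %s' % method)
# ---------------------------------------------------------------------------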