import time
import pickle

import numpy
from theano import tensor

import dataProcessTools
import evaluateTools
import toolsFunction
import interactiveGRULearningBatch

# main_dir, init_sharedVariables, init_tparams, adadelta, and unzip are
# expected to be defined earlier in this module.


def compute_path2vec(
        wordsEmbeddings=None,
        wordsEmbeddings_path=None,
        typesEmbeddings=None,
        typesEmbeddings_path=None,
        word_dimension=0,
        type_dimension=0,
        dimension=0,
        attention_dimension=0,
        wordsSize=0,
        subpaths_map=None,
        subpaths_file='',
        sequences_map=None,
        sequences_file='',
        maxlen_subpaths=1000,
        maxlen=100,  # Sequences longer than this are ignored
        alpha=0,
        beta=0,
        gamma=0,
        test_data_file='',
        top_num=10,
        ideal_data_file='',
        func=None,
):
    """Score all candidates for each test query in one batch with the compiled
    function `func`, then report MAP@top_num and MnDCG@top_num against
    ideal_data_file."""
    model_options = locals().copy()

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path)
        else:
            print 'Exit...'
            exit(0)
    if typesEmbeddings is None:
        if typesEmbeddings_path is not None:
            typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings(typesEmbeddings_path)
        else:
            print 'Exit...'
            exit(0)

    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    errCount = 0
    line_count = 0
    test_map = {}
    print 'Compute MAP and nDCG for file', test_data_file
    with open(test_data_file) as f:
        for l in f:
            arr = l.strip().split()
            query = int(arr[0])
            score_map = {}  # renamed from `map` to avoid shadowing the builtin
            candidates = []
            for i in range(1, len(arr)):
                key1 = arr[0] + '-' + arr[i]
                key2 = arr[i] + '-' + arr[0]
                if key1 in sequences_data or key2 in sequences_data:
                    candidates.append(int(arr[i]))
                else:
                    # No connecting sequence between query and candidate
                    score_map[int(arr[i])] = -1000.
                    errCount += 1
            (sequences_matrix, dependency_matrix, dependWeight_matrix,
             sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix,
             masks_matrix, group_tensor) = dataProcessTools.prepareDataForTestBatch(
                query, candidates, sequences_data, alpha, beta, gamma)
            if len(sequences_matrix) > 0:
                scores = func(sequences_matrix, dependency_matrix, dependWeight_matrix,
                              sequencesLen_vector, discountSeq_matrix,
                              discountForEachNode_matrix, wordsEmbeddings,
                              typesEmbeddings, masks_matrix, group_tensor)
                for index in range(len(candidates)):
                    score_map[candidates[index]] = scores[index]
            else:
                for i in range(1, len(arr)):
                    score_map[int(arr[i])] = -1.
            tops_in_line = toolsFunction.mapSortByValueDESC(score_map, top_num)
            test_map[line_count] = tops_in_line
            line_count += 1
            if line_count % 500 == 0:
                print '+',
            if line_count % 5000 == 0:
                print ' time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    line_count = 0
    ideal_map = {}
    with open(ideal_data_file) as f:
        for l in f:
            arr = l.strip().split()
            arr = [int(x) for x in arr]
            ideal_map[line_count] = arr[1:]
            line_count += 1

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)
    print 'errCount =', errCount
    return MAP, MnDCG
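
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the two ranking metrics reported above,
# using the standard binary-relevance definitions of AP@k and nDCG@k. The
# project's own implementations are evaluateTools.get_MAP and
# evaluateTools.get_MnDCG, whose exact conventions may differ, so treat these
# helpers as illustrative only.
# ---------------------------------------------------------------------------
import math


def _sketch_average_precision_at_k(ranked, relevant, k):
    # Mean of the precision values at each rank that holds a relevant item.
    hits = 0
    score = 0.0
    for rank, item in enumerate(ranked[:k], start=1):
        if item in relevant:
            hits += 1
            score += float(hits) / rank
    return score / min(len(relevant), k) if relevant else 0.0


def _sketch_ndcg_at_k(ranked, relevant, k):
    # DCG of the predicted ranking divided by the DCG of the ideal ranking.
    dcg = sum(1.0 / math.log(rank + 1, 2)
              for rank, item in enumerate(ranked[:k], start=1) if item in relevant)
    idcg = sum(1.0 / math.log(rank + 1, 2)
               for rank in range(1, min(len(relevant), k) + 1))
    return dcg / idcg if idcg > 0 else 0.0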
def interactiveGRUTraining(
        trainingDataFile=main_dir + 'facebook.splits/train.10/train_classmate_1',
        wordsEmbeddings=None,
        wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures',
        typesEmbeddings=None,
        typesEmbeddings_path='',
        word_dimension=22,
        type_dimension=20,
        dimension=64,
        attention_dimension=12,
        wordsSize=1000000,
        subpaths_map=None,
        subpaths_file=main_dir + 'facebook/subpathsSaveFile',
        sequences_map=None,
        sequences_file='',
        maxlen_subpaths=1000,
        maxlen=100,  # Sequences longer than this are ignored
        batch_size=1,
        is_shuffle_for_batch=False,
        alpha=0.1,
        beta=0.1,
        gamma=0.1,
        objective_function_method='hinge-loss',
        objective_function_param=0,
        lrate=0.0001,
        max_epochs=10,
        dispFreq=5,
        saveFreq=5,
        saveto=main_dir + 'facebook/path2vec-modelParams.npz',
        decay=0.01,
):
    """Build the interactive GRU model, compile the Adadelta update functions,
    and train on mini-batches of the training data."""
    model_options = locals().copy()

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path)
        else:
            print 'Exit...'
            exit(0)
    if typesEmbeddings is None:
        if typesEmbeddings_path is not None:
            typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings(typesEmbeddings_path)
        else:
            print 'Exit...'
            exit(0)

    trainingData, trainingPairsData = dataProcessTools.getTrainingData(trainingDataFile)
    allBatches = dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch)
    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    params = init_sharedVariables(model_options)
    tparams = init_tparams(params)
    print 'Generate models ......'
    (trainingParis, sequences_matrix, dependency_matrix, dependWeight_matrix,
     sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix,
     wordsEmbs, typesEmbs, masks_matrix, groups_tensor,
     cost) = interactiveGRULearningBatch.interactiveGRULearning(model_options, tparams)

    print 'Generate gradients ......'
    grads = tensor.grad(cost, wrt=list(tparams.values()))

    print 'Using Adadelta to generate functions ......'
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = adadelta(
        lr, tparams, grads, trainingParis, sequences_matrix, dependency_matrix,
        dependWeight_matrix, sequencesLen_vector, discountSeq_matrix,
        discountForEachNode_matrix, wordsEmbs, typesEmbs, masks_matrix,
        groups_tensor, cost)

    print 'Start training models ......'
    best_p = None
    history_cost = []
    start_time = time.time()
    print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    uidx = 0
    for eidx in range(max_epochs):
        for _, batch in allBatches:
            uidx += 1
            trainingDataForBatch = [trainingData[i] for i in batch]
            trainingPairsForBatch = [trainingPairsData[i] for i in batch]
            (trainingParis_data, sequences_matrix_data, dependency_matrix_data,
             dependWeight_matrix_data, sequencesLen_vector_data, discountSeq_matrix_data,
             discountForEachNode_matrix_data, masks_matrix_data,
             groups_tensor_data) = dataProcessTools.prepareDataForTrainingBatch(
                trainingDataForBatch, trainingPairsForBatch, sequences_data, alpha, beta, gamma)
            if len(trainingParis_data) == 0:
                continue
            cost = f_grad_shared(
                trainingParis_data, sequences_matrix_data, dependency_matrix_data,
                dependWeight_matrix_data, sequencesLen_vector_data, discountSeq_matrix_data,
                discountForEachNode_matrix_data, wordsEmbeddings, typesEmbeddings,
                masks_matrix_data, groups_tensor_data)
            f_update(lrate)
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'bad cost detected:', cost
                return
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost
            if saveto and numpy.mod(uidx, saveFreq) == 0:
                print 'Saving... time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_cost, **params)
                pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                print 'Done'

    end_time = time.time()
    print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))
    print 'Training finished! Cost time ==', end_time - start_time, 's'
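
# ---------------------------------------------------------------------------
# A minimal sketch of reading back a checkpoint written by the save branch
# above: numpy.savez stores history_errs plus one array per model parameter,
# and '<saveto>.pkl' holds the pickled model_options. Rebuilding the compiled
# scoring function for evaluation is model-specific and not shown here.
# ---------------------------------------------------------------------------
def load_checkpoint(saveto):
    archive = numpy.load(saveto)
    params = dict((k, archive[k]) for k in archive.files if k != 'history_errs')
    with open('%s.pkl' % saveto, 'rb') as f:
        model_options = pickle.load(f)
    return params, model_options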
# Non-batched variant: scores each (query, candidate) pair with a separate call
# to `func`. If it lives in the same module as the batched version above, this
# definition shadows the earlier one, so keep whichever variant you intend to use.
def compute_path2vec(
        wordsEmbeddings=None,
        wordsEmbeddings_path=None,
        typesEmbeddings=None,
        typesEmbeddings_path=None,
        word_dimension=0,
        type_dimension=0,
        dimension=0,
        attention_dimension=0,
        wordsSize=0,
        subpaths_map=None,
        subpaths_file='',
        sequences_map=None,
        sequences_file='',
        maxlen_subpaths=1000,
        maxlen=100,  # Sequences longer than this are ignored
        alpha=0,
        beta=0,
        gamma=0,
        test_data_file='',
        top_num=10,
        ideal_data_file='',
        func=None,
):
    model_options = locals().copy()

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path)
        else:
            exit(0)
    if typesEmbeddings is None:
        if typesEmbeddings_path is not None:
            typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings(typesEmbeddings_path)
        else:
            exit(0)

    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    errCount = 0
    line_count = 0
    test_map = {}
    print 'Compute MAP and nDCG for file', test_data_file
    with open(test_data_file) as f:
        for l in f:
            arr = l.strip().split()
            query = int(arr[0])
            score_map = {}  # renamed from `map` to avoid shadowing the builtin
            for i in range(1, len(arr)):
                candidate = int(arr[i])
                (sequences_matrix, dependency_matrix, dependWeight_matrix,
                 sequencesLen_vector, discountSeq_matrix,
                 discountForEachNode_matrix) = dataProcessTools.prepareDataForTest(
                    query, candidate, sequences_data, alpha, beta, gamma)
                if sequences_matrix is None or len(sequences_matrix) == 0:
                    score_map[candidate] = -1000.
                    errCount += 1
                else:
                    value = func(sequences_matrix, dependency_matrix, dependWeight_matrix,
                                 sequencesLen_vector, discountSeq_matrix,
                                 discountForEachNode_matrix, wordsEmbeddings, typesEmbeddings)
                    score_map[candidate] = value
            tops_in_line = toolsFunction.mapSortByValueDESC(score_map, top_num)
            test_map[line_count] = tops_in_line
            line_count += 1

    line_count = 0
    ideal_map = {}
    with open(ideal_data_file) as f:
        for l in f:
            arr = l.strip().split()
            arr = [int(x) for x in arr]
            ideal_map[line_count] = arr[1:]
            line_count += 1

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)
    print 'errCount =', errCount
    return MAP, MnDCG
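
# ---------------------------------------------------------------------------
# A minimal usage sketch, assuming the default Facebook file layout under
# main_dir. The typesEmbeddings_path and sequences_file values below are
# hypothetical placeholders; point them at your own preprocessed files.
# Compiling the scoring function `func` from the trained parameters for
# compute_path2vec is model-specific and omitted.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    interactiveGRUTraining(
        trainingDataFile=main_dir + 'facebook.splits/train.10/train_classmate_1',
        wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures',
        typesEmbeddings_path=main_dir + 'facebook/nodesTypes',   # hypothetical path
        sequences_file=main_dir + 'facebook/sequencesSaveFile',  # hypothetical path
        batch_size=16,
        max_epochs=10,
        saveto=main_dir + 'facebook/path2vec-modelParams.npz',
    )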