Example #1
import time

import dataProcessTools  # project-local data helpers
import toolsFunction     # project-local ranking helpers
import evaluateTools     # project-local MAP/nDCG metrics


def compute_path2vec(
    wordsEmbeddings=None,
    wordsEmbeddings_path=None,
    typesEmbeddings=None,
    typesEmbeddings_path=None,
    word_dimension=0,
    type_dimension=0,
    dimension=0,
    attention_dimension=0,
    wordsSize=0,
    subpaths_map=None,
    subpaths_file='',
    sequences_map=None,
    sequences_file='',
    maxlen_subpaths=1000,
    maxlen=100,  # Sequences longer than this are ignored
    alpha=0,
    beta=0,
    gamma=0,
    test_data_file='',
    top_num=10,
    ideal_data_file='',
    func=None,
):
    model_options = locals().copy()

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(
                wordsEmbeddings_path)
        else:
            print 'No words embeddings or embeddings path given. Exit...'
            exit(0)
    if typesEmbeddings is None:
        if typesEmbeddings_path is not None:
            typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings(
                typesEmbeddings_path)
        else:
            print 'No types embeddings or embeddings path given. Exit...'
            exit(0)

    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    errCount = 0  # candidate pairs with no connecting sequence

    line_count = 0
    test_map = {}  # query line index -> ranked top-num candidates
    print 'Compute MAP and nDCG for file', test_data_file
    with open(test_data_file) as f:
        for l in f:
            arr = l.strip().split()
            query = int(arr[0])
            score_map = {}  # candidate id -> score
            candidates = []
            for i in range(1, len(arr)):
                key1 = arr[0] + '-' + arr[i]
                key2 = arr[i] + '-' + arr[0]
                if key1 in sequences_data or key2 in sequences_data:
                    candidates.append(int(arr[i]))
                else:
                    # No sequence connects this pair; push it to the bottom.
                    score_map[int(arr[i])] = -1000.
                    errCount += 1
            # Build one batched test instance for all surviving candidates.
            sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix, masks_matrix, group_tensor = dataProcessTools.prepareDataForTestBatch(
                query, candidates, sequences_data, alpha, beta, gamma)
            if len(sequences_matrix) > 0:
                scores = func(sequences_matrix, dependency_matrix,
                              dependWeight_matrix, sequencesLen_vector,
                              discountSeq_matrix, discountForEachNode_matrix,
                              wordsEmbeddings, typesEmbeddings, masks_matrix,
                              group_tensor)
                for index in range(len(candidates)):
                    score_map[candidates[index]] = scores[index]
            else:
                for i in range(1, len(arr)):
                    score_map[int(arr[i])] = -1.

            tops_in_line = toolsFunction.mapSortByValueDESC(score_map, top_num)
            test_map[line_count] = tops_in_line
            line_count += 1
            if line_count % 500 == 0:
                print '+',
                if line_count % 5000 == 0:
                    print ' time ==', time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    line_count = 0
    ideal_map = {}
    with open(ideal_data_file) as f:
        for l in f:
            arr = l.strip().split()
            arr = [int(x) for x in arr]
            ideal_map[line_count] = arr[1:]
            line_count += 1

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)

    print 'errCount =', errCount
    return MAP, MnDCG
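
A minimal usage sketch for the function above. Every path below, and the compiled Theano scoring function score_fn, is a hypothetical placeholder patterned after the training example that follows; the original page does not show the call site.

# Hypothetical call; all paths and score_fn are placeholders, not taken
# from the original project.
MAP, MnDCG = compute_path2vec(
    wordsEmbeddings_path='facebook/nodesFeatures',                 # assumed
    typesEmbeddings_path='facebook/typesFeatures',                 # assumed
    sequences_file='facebook/sequencesSaveFile',                   # assumed
    test_data_file='facebook.splits/test.10/test_classmate_1',     # assumed
    ideal_data_file='facebook.splits/ideal.10/ideal_classmate_1',  # assumed
    alpha=0.1, beta=0.1, gamma=0.1,
    top_num=10,
    func=score_fn,  # compiled Theano scoring function, built elsewhere
)
print 'MAP =', MAP, ', MnDCG =', MnDCG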
Example #2
import pickle
import time

import numpy
from theano import tensor

import dataProcessTools             # project-local data helpers
import interactiveGRULearningBatch  # project-local model builder

# main_dir, init_sharedVariables, init_tparams, unzip and adadelta are
# assumed to be defined elsewhere in this module.


def interactiveGRUTraining(
    trainingDataFile=main_dir + 'facebook.splits/train.10/train_classmate_1',
    wordsEmbeddings=None,
    wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures',
    typesEmbeddings=None,
    typesEmbeddings_path=None,
    word_dimension=22,
    type_dimension=20,
    dimension=64,
    attention_dimension=12,
    wordsSize=1000000,
    subpaths_map=None,
    subpaths_file=main_dir + 'facebook/subpathsSaveFile',
    sequences_map=None,
    sequences_file='',
    maxlen_subpaths=1000,
    maxlen=100,  # Sequences longer than this are ignored
    batch_size=1,
    is_shuffle_for_batch=False,
    alpha=0.1,
    beta=0.1,
    gamma=0.1,
    objective_function_method='hinge-loss',
    objective_function_param=0,
    lrate=0.0001,
    max_epochs=10,
    dispFreq=5,
    saveFreq=5,
    saveto=main_dir + 'facebook/path2vec-modelParams.npz',
    decay=0.01,
):
    model_options = locals().copy()

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(
                wordsEmbeddings_path)
        else:
            print 'No words embeddings or embeddings path given. Exit...'
            exit(0)
    if typesEmbeddings is None:
        if typesEmbeddings_path is not None:
            typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings(
                typesEmbeddings_path)
        else:
            print 'No types embeddings or embeddings path given. Exit...'
            exit(0)

    trainingData, trainingPairsData = dataProcessTools.getTrainingData(
        trainingDataFile)
    allBatches = dataProcessTools.get_minibatches_idx(len(trainingData),
                                                      batch_size,
                                                      is_shuffle_for_batch)

    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    params = init_sharedVariables(model_options)
    tparams = init_tparams(params)
    print 'Generate models ......'
    # Build the symbolic computation graph and the training cost.
    trainingParis, sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix, wordsEmbs, typesEmbs, masks_matrix, groups_tensor, cost = interactiveGRULearningBatch.interactiveGRULearning(
        model_options, tparams)

    print 'Generate gradients ......'
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    print 'Using Adadelta to generate functions ......'
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = adadelta(
        lr, tparams, grads, trainingParis, sequences_matrix, dependency_matrix,
        dependWeight_matrix, sequencesLen_vector, discountSeq_matrix,
        discountForEachNode_matrix, wordsEmbs, typesEmbs, masks_matrix,
        groups_tensor, cost)

    print 'Start training models ......'
    best_p = None
    history_cost = []

    start_time = time.time()
    print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(time.time()))
    uidx = 0
    for eidx in range(max_epochs):
        for _, batch in allBatches:
            uidx += 1
            trainingDataForBatch = [trainingData[i] for i in batch]
            trainingPairsForBatch = [trainingPairsData[i] for i in batch]
            trainingParis_data, sequences_matrix_data, dependency_matrix_data, dependWeight_matrix_data, sequencesLen_vector_data, discountSeq_matrix_data, discountForEachNode_matrix_data, masks_matrix_data, groups_tensor_data = dataProcessTools.prepareDataForTrainingBatch(
                trainingDataForBatch, trainingPairsForBatch, sequences_data,
                alpha, beta, gamma)
            if len(trainingParis_data) == 0:
                continue
            cost = f_grad_shared(
                trainingParis_data, sequences_matrix_data,
                dependency_matrix_data, dependWeight_matrix_data,
                sequencesLen_vector_data, discountSeq_matrix_data,
                discountForEachNode_matrix_data, wordsEmbeddings,
                typesEmbeddings, masks_matrix_data, groups_tensor_data)
            f_update(lrate)
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'bad cost detected:', cost
                return
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch =', eidx, ',  Update =', uidx, ',  Cost =', cost
            if saveto and numpy.mod(uidx, saveFreq) == 0:
                print 'Saving... time ==', time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_cost, **params)
                pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                print 'Done'
    end_time = time.time()
    print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(end_time))
    print 'Training finished! Cost time ==', end_time - start_time, 's'
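
The loop above depends on an adadelta helper that compiles the two Theano functions f_grad_shared and f_update. Its source is not included on this page; the sketch below is the standard Adadelta pattern from the Theano LSTM tutorial, adapted to take an arbitrary list of symbolic inputs. The project's actual helper may differ in detail.

import theano
from theano import tensor

def adadelta(lr, tparams, grads, *inputs_and_cost):
    # inputs_and_cost: the symbolic model inputs, followed by the cost node.
    inputs, cost = list(inputs_and_cost[:-1]), inputs_and_cost[-1]

    zipped_grads = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * 0., name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * 0., name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    # f_grad_shared: forward + backward pass; caches gradients in shared vars.
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]
    f_grad_shared = theano.function(inputs, cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    # f_update: applies the Adadelta-scaled step to the parameters.
    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update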
Example #3
import dataProcessTools  # project-local data helpers
import toolsFunction     # project-local ranking helpers
import evaluateTools     # project-local MAP/nDCG metrics


def compute_path2vec(
    wordsEmbeddings=None,
    wordsEmbeddings_path=None,
    typesEmbeddings=None,
    typesEmbeddings_path=None,
    word_dimension=0,
    type_dimension=0,
    dimension=0,
    attention_dimension=0,
    wordsSize=0,
    subpaths_map=None,
    subpaths_file='',
    sequences_map=None,
    sequences_file='',
    maxlen_subpaths=1000,
    maxlen=100,  # Sequences longer than this are ignored
    alpha=0,
    beta=0,
    gamma=0,
    test_data_file='',
    top_num=10,
    ideal_data_file='',
    func=None,
):
    model_options = locals().copy()

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(
                wordsEmbeddings_path)
        else:
            print 'No words embeddings or embeddings path given. Exit...'
            exit(0)
    if typesEmbeddings is None:
        if typesEmbeddings_path is not None:
            typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings(
                typesEmbeddings_path)
        else:
            print 'No types embeddings or embeddings path given. Exit...'
            exit(0)
            
    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    errCount = 0  # candidate pairs with no connecting sequence

    line_count = 0
    test_map = {}  # query line index -> ranked top-num candidates
    print 'Compute MAP and nDCG for file', test_data_file
    with open(test_data_file) as f:
        for l in f:
            arr = l.strip().split()
            query = int(arr[0])
            score_map = {}  # candidate id -> score
            for i in range(1, len(arr)):
                candidate = int(arr[i])
                sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix = dataProcessTools.prepareDataForTest(
                    query, candidate, sequences_data, alpha, beta, gamma)
                if sequences_matrix is None or len(sequences_matrix) == 0:
                    # No sequence connects this pair; push it to the bottom.
                    score_map[candidate] = -1000.
                    errCount += 1
                else:
                    value = func(sequences_matrix, dependency_matrix,
                                 dependWeight_matrix, sequencesLen_vector,
                                 discountSeq_matrix, discountForEachNode_matrix,
                                 wordsEmbeddings, typesEmbeddings)
                    score_map[candidate] = value

            tops_in_line = toolsFunction.mapSortByValueDESC(score_map, top_num)
            test_map[line_count] = tops_in_line
            line_count += 1
                
    
    line_count = 0
    ideal_map = {}
    with open(ideal_data_file) as f:
        for l in f:
            arr = l.strip().split()
            arr = [int(x) for x in arr]
            ideal_map[line_count] = arr[1:]
            line_count += 1
    
    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)

    print 'errCount =', errCount
    return MAP, MnDCG
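
Both test variants rank candidates with toolsFunction.mapSortByValueDESC, whose source is not shown here. Judging from how evaluateTools consumes test_map, it presumably returns the top_num keys sorted by descending score; the sketch below is a minimal implementation under that assumption, not the project's actual code.

# Assumed behavior of toolsFunction.mapSortByValueDESC: return the top_num
# keys of score_map, ordered by descending value.
def mapSortByValueDESC(score_map, top_num):
    ranked = sorted(score_map.items(), key=lambda kv: kv[1], reverse=True)
    return [key for key, value in ranked[:top_num]]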