def compute_proxEmbedBySubgraph(
        wordsEmbeddings=None,
        wordsEmbeddings_path=None,
        word_dimension=0,
        dimension=0,
        wordsSize=0,
        subpaths_map=None,
        subpaths_file=None,
        subgraphs_file='',
        maxlen_subpaths=1000,
        maxlen=100,
        test_data_file='',
        top_num=10,
        ideal_data_file='',
        func=None,
):
    """
    Score every (query, candidate) pair of a test file with ``func`` and
    return the MAP and mean nDCG of the resulting rankings.

    Parameters
    ----------
    wordsEmbeddings : embeddings handed to ``func``; when None they are
        loaded with ``dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path)``.
    wordsEmbeddings_path : path used only when ``wordsEmbeddings`` is None.
    dimension : forwarded unchanged to the data-preparation helper.
    subgraphs_file : file read by
        ``dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths``.
    test_data_file : text file; each line is whitespace-separated integers,
        the first being the query id and the rest the candidate ids.
    top_num : passed to ``toolsFunction.mapSortByValueDESC`` and to both
        evaluation functions.
    ideal_data_file : text file of whitespace-separated integers per line;
        the first integer is dropped and the rest are stored as the
        ground-truth entry for that line.
    func : callable invoked as ``func(sequences, mask, lens, subgraph_lens,
        wordsEmbeddings, buffer_tensor, nodesLens)``; its return value is
        stored as the candidate's score.
    word_dimension, wordsSize, subpaths_map, subpaths_file, maxlen_subpaths,
    maxlen : not used by this function; kept so existing callers still work.

    Returns
    -------
    (MAP, MnDCG) as produced by ``evaluateTools.get_MAP`` and
    ``evaluateTools.get_MnDCG``.

    Raises
    ------
    SystemExit : when neither ``wordsEmbeddings`` nor
        ``wordsEmbeddings_path`` is given.  The message goes to stderr and
        the exit status is non-zero (the previous code left silently with
        status 0).
    """
    if wordsEmbeddings is None:
        if wordsEmbeddings_path is None:
            raise SystemExit(
                'Neither wordsEmbeddings nor wordsEmbeddings_path was given, exit!!!')
        # The helper returns a 3-tuple; only the embeddings are needed here.
        wordsEmbeddings, _, _ = dataProcessTools.getWordsEmbeddings(
            wordsEmbeddings_path)

    subgraphs_map = dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(
        subgraphs_file)

    # Rankings are keyed by the 0-based line number of the test file.
    test_map = {}
    # Single-argument form prints the same text under Python 2 and 3.
    print('Compute MAP and nDCG for file  %s' % test_data_file)
    with open(test_data_file) as f:
        for line_no, line in enumerate(f):
            tokens = line.split()
            query = int(tokens[0])
            scores = {}
            for token in tokens[1:]:
                candidate = int(token)
                prepared = dataProcessTools.prepareDataForTestForSubgraphSingleSequenceWithLengthsAsymmetric(
                    query, candidate, subgraphs_map, dimension)
                (sequences_data, mask_data, lens_data, subgraph_lens_data,
                 buffer_tensor_data, nodesLens_data) = prepared
                if sequences_data is None and mask_data is None and lens_data is None:
                    # No data for this pair: use a fixed very low score.
                    scores[candidate] = -1000.
                else:
                    scores[candidate] = func(
                        sequences_data, mask_data, lens_data,
                        subgraph_lens_data, wordsEmbeddings,
                        buffer_tensor_data, nodesLens_data)

            test_map[line_no] = toolsFunction.mapSortByValueDESC(scores, top_num)

    # Ground truth, keyed by the 0-based line number of the ideal file.
    ideal_map = {}
    with open(ideal_data_file) as f:
        for line_no, line in enumerate(f):
            # Convert every token first (as before), then drop the leading id.
            ideal_map[line_no] = [int(token) for token in line.split()][1:]

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)

    return MAP, MnDCG
    
    
    
    
    
Example #2
0
def compute_proxEmbed(
        wordsEmbeddings=None,  # words embeddings
        wordsEmbeddings_path=None,  # the file path of words embeddings
        word_dimension=0,  # not used by this function
        dimension=0,  # not used by this function
        wordsSize=0,  # not used by this function
        subpaths_map=None,  # contains sub-paths
        subpaths_file=None,  # the file which contains sub-paths
        maxlen_subpaths=1000,  # forwarded to dataProcessTools.loadAllSubPaths
        maxlen=100,  # not used by this function
        test_data_file='',  # the file path of test data
        top_num=10,  # the top num to predict
        ideal_data_file='',  # ground truth
        func=None,  # model function
):
    """
    Score every (query, candidate) pair of a test file with ``func`` and
    return the MAP and mean nDCG of the resulting rankings.

    ``wordsEmbeddings`` is loaded from ``wordsEmbeddings_path`` and
    ``subpaths_map`` from ``subpaths_file`` (with ``maxlen_subpaths``) when
    they are not supplied directly.

    Each line of ``test_data_file`` is whitespace-separated integers: the
    first is the query id, the rest are candidate ids.  For each pair the
    sub-path data is prepared with ``dataProcessTools.prepareDataForTest``;
    when all three returned values are None the candidate gets the fixed
    score -1000., otherwise the score is
    ``func(matrix, mask, lens, wordsEmbeddings)``.

    Each line of ``ideal_data_file`` is whitespace-separated integers; the
    first is dropped and the rest form the ground truth for that line.

    Returns ``(MAP, MnDCG)`` from ``evaluateTools``.

    Raises SystemExit (message on stderr, non-zero status) when the
    embeddings or the sub-paths can be obtained neither directly nor from a
    path.  Previously this printed the message and exited with status 0.
    """
    if wordsEmbeddings is None:
        if wordsEmbeddings_path is None:
            raise SystemExit('There is not path for wordsEmbeddings, exit!!!')
        # The helper returns a 3-tuple; only the embeddings are needed here.
        wordsEmbeddings, _, _ = dataProcessTools.getWordsEmbeddings(
            wordsEmbeddings_path)

    if subpaths_map is None:
        if subpaths_file is None:
            raise SystemExit('There is not path for sub-paths, exit!!!')
        subpaths_map = dataProcessTools.loadAllSubPaths(
            subpaths_file, maxlen_subpaths)

    # Rankings are keyed by the 0-based line number of the test file.
    test_map = {}
    # Single-argument form prints the same text under Python 2 and 3.
    print('Compute MAP and nDCG for file  %s' % test_data_file)
    with open(test_data_file) as f:
        for line_no, line in enumerate(f):
            tokens = line.split()
            query = int(tokens[0])
            scores = {}
            for token in tokens[1:]:
                candidate = int(token)
                subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = \
                    dataProcessTools.prepareDataForTest(
                        query, candidate, subpaths_map)
                if (subPaths_matrix_data is None
                        and subPaths_mask_data is None
                        and subPaths_lens_data is None):
                    # No data for this pair: use a fixed very low score.
                    scores[candidate] = -1000.
                else:
                    scores[candidate] = func(
                        subPaths_matrix_data, subPaths_mask_data,
                        subPaths_lens_data, wordsEmbeddings)

            test_map[line_no] = toolsFunction.mapSortByValueDESC(scores, top_num)

    # Ground truth, keyed by the 0-based line number of the ideal file.
    ideal_map = {}
    with open(ideal_data_file) as f:
        for line_no, line in enumerate(f):
            # Convert every token first (as before), then drop the leading id.
            ideal_map[line_no] = [int(token) for token in line.split()][1:]

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)

    return MAP, MnDCG
Example #3
0
def compute_path2vec(
    wordsEmbeddings=None,
    wordsEmbeddings_path='None',
    typesEmbeddings=None,
    typesEmbeddings_path='None',
    word_dimension=0,
    type_dimension=0,
    dimension=0,
    attention_dimension=0,
    wordsSize=0,
    subpaths_map=None,
    subpaths_file='',
    sequences_map=None,
    sequences_file='',
    maxlen_subpaths=1000,
    maxlen=100,  # Sequence longer then this get ignored 
    alpha=0,
    beta=0,
    gamma=0,
    test_data_file='',
    top_num=10,
    ideal_data_file='',
    func=None,
):
    model_options = locals().copy()

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(
                wordsEmbeddings_path)
        else:
            print 'Exit...'
            exit(0)
    if typesEmbeddings is None:
        if typesEmbeddings_path is not None:
            typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings(
                typesEmbeddings_path)
        else:
            print 'Exit...'
            exit(0)

    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    errCount = 0

    line_count = 0
    test_map = {}
    print 'Compute MAP and nDCG for file ', test_data_file
    with open(test_data_file) as f:
        for l in f:
            arr = l.strip().split()
            query = int(arr[0])
            map = {}
            candidates = []
            for i in range(1, len(arr)):
                key1 = arr[0] + '-' + arr[i]
                key2 = arr[i] + '-' + arr[0]
                if key1 in sequences_data or key2 in sequences_data:
                    candidates.append(int(arr[i]))
                else:
                    map[int(arr[i])] = -1000.
                    errCount += 1
            sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix, masks_matrix, group_tensor = dataProcessTools.prepareDataForTestBatch(
                query, candidates, sequences_data, alpha, beta, gamma)
            if len(sequences_matrix) > 0:
                scores = func(sequences_matrix, dependency_matrix,
                              dependWeight_matrix, sequencesLen_vector,
                              discountSeq_matrix, discountForEachNode_matrix,
                              wordsEmbeddings, typesEmbeddings, masks_matrix,
                              group_tensor)
                for index in range(len(candidates)):
                    map[candidates[index]] = scores[index]
            else:
                for i in range(1, len(arr)):
                    map[int(arr[i])] = -1.

            tops_in_line = toolsFunction.mapSortByValueDESC(map, top_num)
            test_map[line_count] = tops_in_line
            line_count += 1
            if line_count % 500 == 0:
                print '+',
                if line_count % 5000 == 0:
                    print ' time ==', time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    line_count = 0
    ideal_map = {}
    with open(ideal_data_file) as f:
        for l in f:
            arr = l.strip().split()
            arr = [int(x) for x in arr]
            ideal_map[line_count] = arr[1:]
            line_count += 1

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)

    print 'errCount =', errCount
    return MAP, MnDCG
Example #4
0
def compute_metagraphAttention(
        wordsEmbeddings=None,  # words embeddings
        wordsEmbeddings_path=None,  # the file path of words embeddings
        metagraphEmbeddings_path=None,  # the file path of metagraph embeddings
        wordsSize=0,  # not used by this function
        subpaths_map=None,  # contains sub-paths
        subpaths_file=None,  # the file which contains sub-paths
        maxlen_subpaths=1000,  # forwarded to the sub-path loader
        test_data_file='',  # test data file
        top_num=10,  # top num in experiments
        ideal_data_file='',  # ideal data file
        func=None,  # the MPE process model
):
    """
    Evaluate the MPE model: score every (query, candidate) pair of a test
    file with ``func`` and return the MAP and mean nDCG of the rankings.

    ``wordsEmbeddings`` is loaded from ``wordsEmbeddings_path`` when not
    supplied.  ``subpaths_map`` is loaded with
    ``dataProcessTools.loadAllSubPathsRomove0Path(subpaths_file,
    maxlen_subpaths, wordsEmbeddings)`` when not supplied.  Metagraph
    embeddings are always loaded from ``metagraphEmbeddings_path``.

    Each line of ``test_data_file`` is whitespace-separated integers: the
    first is the query id, the rest are candidate ids.  For each pair the
    sub-path data is prepared with ``dataProcessTools.prepareDataForTest``;
    when all three returned values are None the candidate gets the fixed
    score -1000., otherwise the score is
    ``func(metagraphEmbeddings, matrix, mask, wordsEmbeddings)``.

    Each line of ``ideal_data_file`` is whitespace-separated integers; the
    first is dropped and the rest form the ground truth for that line.

    Returns ``(MAP, MnDCG)`` from ``evaluateTools``.

    Raises SystemExit (message on stderr, non-zero status) when the
    embeddings or the sub-paths can be obtained neither directly nor from a
    path.  Previously this printed the message and exited with status 0.
    """
    if wordsEmbeddings is None:
        if wordsEmbeddings_path is None:
            raise SystemExit('There is not path for wordsEmbeddings, exit!!!')
        # The helper returns a 3-tuple; only the embeddings are needed here.
        wordsEmbeddings, _, _ = dataProcessTools.getWordsEmbeddings(
            wordsEmbeddings_path)

    if subpaths_map is None:
        if subpaths_file is None:
            raise SystemExit('There is not path for sub-paths, exit!!!')
        subpaths_map = dataProcessTools.loadAllSubPathsRomove0Path(
            subpaths_file, maxlen_subpaths, wordsEmbeddings)

    # Only the embedding data of the returned 3-tuple is used below.
    metagraphEmbedding_data, _, _ = dataProcessTools.getMetagraphEmbeddings(
        metagraphEmbeddings_path)

    # Rankings are keyed by the 0-based line number of the test file.
    test_map = {}
    # Single-argument form prints the same text under Python 2 and 3.
    print('Compute MAP and nDCG for file  %s' % test_data_file)
    with open(test_data_file) as f:
        for line_no, line in enumerate(f):
            tokens = line.split()
            query = int(tokens[0])
            scores = {}
            for token in tokens[1:]:
                candidate = int(token)
                subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = \
                    dataProcessTools.prepareDataForTest(
                        query, candidate, subpaths_map)
                if (subPaths_matrix_data is None
                        and subPaths_mask_data is None
                        and subPaths_lens_data is None):
                    # No data for this pair: use a fixed very low score.
                    scores[candidate] = -1000.
                else:
                    scores[candidate] = func(
                        metagraphEmbedding_data, subPaths_matrix_data,
                        subPaths_mask_data, wordsEmbeddings)
                # Drop the per-pair data before the next pair is prepared.
                del subPaths_matrix_data
                del subPaths_mask_data
                del subPaths_lens_data

            test_map[line_no] = toolsFunction.mapSortByValueDESC(scores, top_num)
            # Release the per-line score table and collect after every line,
            # as the original code did.
            scores = None
            gc.collect()

    # Ground truth, keyed by the 0-based line number of the ideal file.
    ideal_map = {}
    with open(ideal_data_file) as f:
        for line_no, line in enumerate(f):
            # Convert every token first (as before), then drop the leading id.
            ideal_map[line_no] = [int(token) for token in line.split()][1:]

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)

    return MAP, MnDCG
    
    
Example #5
0
def compute_path2vec(
        wordsEmbeddings=None,
        wordsEmbeddings_path='None',
        typesEmbeddings=None,
        typesEmbeddings_path='None',
        word_dimension=0,
        type_dimension=0,
        dimension=0,
        attention_dimension=0,
        wordsSize=0,
        subpaths_map=None,
        subpaths_file='',
        sequences_map=None,
        sequences_file='',
        maxlen_subpaths=1000,
        maxlen=100,
        alpha=0,
        beta=0,
        gamma=0,
        test_data_file='',
        top_num=10,
        ideal_data_file='',
        func=None,
):
    """
    Score every (query, candidate) pair of a test file with ``func`` (one
    call per pair) and return the MAP and mean nDCG of the rankings.

    ``wordsEmbeddings`` / ``typesEmbeddings`` are loaded from their
    ``*_path`` arguments when not supplied.  Sequences are read from
    ``sequences_file`` with ``dataProcessTools.readAllSequencesFromFile``.

    Each line of ``test_data_file`` is whitespace-separated integers: the
    first is the query id, the rest are candidate ids.  Each pair is
    prepared with ``dataProcessTools.prepareDataForTest`` (using ``alpha``,
    ``beta``, ``gamma``).  When the returned sequence matrix is None or
    empty the candidate gets the fixed score -1000. and is counted in
    ``errCount``; otherwise its score is the return value of ``func`` called
    with the six prepared values followed by ``wordsEmbeddings`` and
    ``typesEmbeddings``.

    Each line of ``ideal_data_file`` is whitespace-separated integers; the
    first is dropped and the rest form the ground truth for that line.

    ``word_dimension``, ``type_dimension``, ``dimension``,
    ``attention_dimension``, ``wordsSize``, ``subpaths_map``,
    ``subpaths_file``, ``sequences_map``, ``maxlen_subpaths`` and ``maxlen``
    are not used by this function; they are kept for caller compatibility.

    Returns ``(MAP, MnDCG)`` from ``evaluateTools``.

    Raises SystemExit (message on stderr, non-zero status) when embeddings
    are None and the corresponding path is None.  The previous code left
    silently with status 0.
    """
    # NOTE(review): both path defaults are the *string* 'None', so these
    # ``is None`` guards only fire when a caller passes None explicitly; by
    # default the helper is called with the path 'None' -- confirm intent.
    if wordsEmbeddings is None:
        if wordsEmbeddings_path is None:
            raise SystemExit(
                'Neither wordsEmbeddings nor wordsEmbeddings_path was given, exit!!!')
        # The helper returns a 3-tuple; only the embeddings are needed here.
        wordsEmbeddings, _, _ = dataProcessTools.getWordsEmbeddings(
            wordsEmbeddings_path)
    if typesEmbeddings is None:
        if typesEmbeddings_path is None:
            raise SystemExit(
                'Neither typesEmbeddings nor typesEmbeddings_path was given, exit!!!')
        typesEmbeddings, _, _ = dataProcessTools.getTypesEmbeddings(
            typesEmbeddings_path)

    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    # Number of pairs for which no sequence data could be prepared.
    errCount = 0

    # Rankings are keyed by the 0-based line number of the test file.
    test_map = {}
    # Single-argument form prints the same text under Python 2 and 3.
    print('Compute MAP and nDCG for file  %s' % test_data_file)
    with open(test_data_file) as f:
        for line_no, line in enumerate(f):
            tokens = line.split()
            query = int(tokens[0])
            scores = {}
            for token in tokens[1:]:
                candidate = int(token)
                (sequences_matrix, dependency_matrix, dependWeight_matrix,
                 sequencesLen_vector, discountSeq_matrix,
                 discountForEachNode_matrix) = \
                    dataProcessTools.prepareDataForTest(
                        query, candidate, sequences_data, alpha, beta, gamma)
                if sequences_matrix is None or len(sequences_matrix) == 0:
                    # No data for this pair: use a fixed very low score.
                    scores[candidate] = -1000.
                    errCount += 1
                else:
                    scores[candidate] = func(
                        sequences_matrix, dependency_matrix,
                        dependWeight_matrix, sequencesLen_vector,
                        discountSeq_matrix, discountForEachNode_matrix,
                        wordsEmbeddings, typesEmbeddings)

            test_map[line_no] = toolsFunction.mapSortByValueDESC(scores, top_num)

    # Ground truth, keyed by the 0-based line number of the ideal file.
    ideal_map = {}
    with open(ideal_data_file) as f:
        for line_no, line in enumerate(f):
            # Convert every token first (as before), then drop the leading id.
            ideal_map[line_no] = [int(token) for token in line.split()][1:]

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)

    print('errCount = %s' % errCount)
    return MAP, MnDCG