Ejemplo n.º 1
0
    def prodPreWordEmbedingMat(self, gensimModelPath, wordSequencesList):
        '''
        Build an embedding matrix for every distinct word in the sequences.

        The word-vector model is loaded from gensimModelPath through
        WordVecOpt. Each distinct word found in wordSequencesList gets one
        row of the matrix; words that are not in the model vocabulary keep
        an all-zero row.

        Returns a tuple (nb_words, EMBEDDING_DIM, embedding_matrix), where
        embedding_matrix has shape (nb_words, EMBEDDING_DIM).

        NOTE(review): rows follow the iteration order of a set and that
        word-to-row mapping is not returned -- confirm how callers align
        rows with words.
        '''
        # load the word-vector model and fetch its vocabulary once
        vecHelper = WordVecOpt(modelPath=gensimModelPath)
        model = vecHelper.loadModelfromFile(gensimModelPath)
        knownWords = model.vocab
        EMBEDDING_DIM = model.vector_size

        # collapse all sequences into the list of distinct words
        uniqueWords = list(
            set(word for sequence in wordSequencesList for word in sequence))
        nb_words = len(uniqueWords)
        print('nb_words: ' + str(nb_words))

        # one row per distinct word; out-of-vocabulary rows stay zero
        embedding_matrix = numpy.zeros((nb_words, EMBEDDING_DIM))
        for row, word in enumerate(uniqueWords):
            if word not in knownWords:
                continue
            embedding_matrix[row] = vecHelper.getWordVec(model, word)

        return nb_words, EMBEDDING_DIM, embedding_matrix
Ejemplo n.º 2
0
    def prodPreWordEmbedingMat(self, gensimModelPath, wordSequencesList):
        '''
        Produce the pre-trained word embedding matrix for the given sequences.

        A WordVecOpt helper loads the model stored at gensimModelPath. The
        distinct words of wordSequencesList each receive one matrix row,
        filled with the model's vector when the word is in the model
        vocabulary and left as zeros otherwise.

        Returns (nb_words, EMBEDDING_DIM, embedding_matrix): the number of
        distinct words, the model's vector size, and the
        (nb_words, EMBEDDING_DIM) matrix.

        NOTE(review): the row order comes from iterating a set and is not
        exposed to the caller -- verify that callers do not rely on it.
        '''
        # model loading and the values read from it
        wordVecObj = WordVecOpt(modelPath=gensimModelPath)
        loadedModel = wordVecObj.loadModelfromFile(gensimModelPath)
        modelVocab = loadedModel.vocab
        EMBEDDING_DIM = loadedModel.vector_size

        # distinct words across every sequence
        distinctWords = list(set(
            token for sequence in wordSequencesList for token in sequence))
        nb_words = len(distinctWords)
        print('nb_words: ' + str(nb_words))

        # rows default to zero and are overwritten only for known words
        embedding_matrix = numpy.zeros((nb_words, EMBEDDING_DIM))
        for index, token in enumerate(distinctWords):
            if token in modelVocab:
                embedding_matrix[index] = wordVecObj.getWordVec(
                    loadedModel, token)

        return nb_words, EMBEDDING_DIM, embedding_matrix
Ejemplo n.º 3
0
 def buildBasicSemGraph(self, w2vModelPath, allWordList, topN=20, edgeThreshold=0.2, unionRange=60):
     '''
     Build relationships between every ordered pair of entity nodes and
     write them out in batches.

     Nodes are created from allWordList via createBasicEmtityNodes. For
     each ordered pair of distinct nodes a relationship is produced by
     createBasicRelasBtwNodes (topN and edgeThreshold are forwarded to it
     unchanged). Relationships are pooled; whenever the pool holds
     unionRange entries it is merged with unionSemRelatSubGraph and
     written with constructSemGraphOnNeo. A final partial pool is written
     the same way.

     Parameters:
         w2vModelPath: path handed to WordVecOpt.
         allWordList: words handed to createBasicEmtityNodes.
         topN, edgeThreshold: forwarded to createBasicRelasBtwNodes.
         unionRange: number of relationships written per batch.

     Returns nothing; progress is reported with print.
     '''
     graphOptObj = NeoDataGraphOpt()
     w2vOptObj = WordVecOpt(w2vModelPath)
     print('ready to build semantic graph!')

     nounNodes = self.createBasicEmtityNodes(graphOptObj, allWordList)
     cacheRelationShips = []
     graphRelatSize = 0
     for i, sourceNode in enumerate(nounNodes):
         for j, targetNode in enumerate(nounNodes):
             if i == j:
                 continue
             adjRelationShip = self.createBasicRelasBtwNodes(
                 w2vOptObj, graphOptObj, sourceNode, targetNode,
                 topN, edgeThreshold)
             # Pool the relationship first and flush afterwards. Flushing
             # instead of pooling would drop the relationship that arrives
             # when the pool is already full.
             cacheRelationShips.append(adjRelationShip)
             print('add relat to cache pool.')
             if len(cacheRelationShips) >= unionRange:
                 semSubGraph = self.unionSemRelatSubGraph(graphOptObj, cacheRelationShips)
                 self.constructSemGraphOnNeo(graphOptObj, semSubGraph)
                 print('construct subgraph cache range: ' + str(unionRange) + '!')
                 graphRelatSize += len(cacheRelationShips)
                 cacheRelationShips = []
     # write whatever is left over from the last, partial batch
     if cacheRelationShips:
         semSubGraph = self.unionSemRelatSubGraph(graphOptObj, cacheRelationShips)
         self.constructSemGraphOnNeo(graphOptObj, semSubGraph)
         print('construct surplus subgraph size: ' + str(len(cacheRelationShips)) + '!')
         graphRelatSize += len(cacheRelationShips)
     print('construct semgraph on neo size: ' + str(graphRelatSize) + '!')
Ejemplo n.º 4
0
    def buildLexGroupSemGraph(self,
                              w2vModelPath,
                              allWordList,
                              lex_cluster=None,
                              vec_z_ratio=100,
                              canopy_t_ratio=3,
                              topN_rev=20,
                              topN=20,
                              edgeThreshold=0.2,
                              unionRange=60):
        '''
        Build relationships between every ordered pair of lex-group nodes
        and write them out in batches.

        Nodes are created via createLexClustEmtityNodes (lex_cluster,
        vec_z_ratio and canopy_t_ratio are forwarded to it). For each
        ordered pair of distinct nodes a relationship is produced by
        createLexGroupRelasBtwNodes (topN_rev, topN and edgeThreshold are
        forwarded to it). Relationships are pooled; whenever the pool
        holds unionRange entries it is merged with unionSemRelatSubGraph
        and written with constructSemGraphOnNeo. A final partial pool is
        written the same way.

        Parameters:
            w2vModelPath: path handed to WordVecOpt.
            allWordList: words handed to createLexClustEmtityNodes.
            lex_cluster, vec_z_ratio, canopy_t_ratio: forwarded to
                createLexClustEmtityNodes.
            topN_rev, topN, edgeThreshold: forwarded to
                createLexGroupRelasBtwNodes.
            unionRange: number of relationships written per batch.

        Returns nothing; progress is reported with print.
        '''
        graphOptObj = NeoDataGraphOpt()
        w2vOptObj = WordVecOpt(w2vModelPath)
        print('ready to build lex-group semantic graph!')

        lexGroupNodes = self.createLexClustEmtityNodes(
            graphOptObj,
            w2vOptObj,
            allWordList,
            cluster=lex_cluster,
            vec_z_ratio=vec_z_ratio,
            canopy_t_ratio=canopy_t_ratio)
        cacheRelationShips = []
        graphRelatSize = 0
        for i, sourceNode in enumerate(lexGroupNodes):
            for j, targetNode in enumerate(lexGroupNodes):
                if i == j:
                    continue
                adjRelationShip = self.createLexGroupRelasBtwNodes(
                    w2vOptObj, graphOptObj, sourceNode, targetNode,
                    topN_rev, topN, edgeThreshold)
                # Pool the relationship first and flush afterwards.
                # Flushing instead of pooling would drop the relationship
                # that arrives when the pool is already full.
                cacheRelationShips.append(adjRelationShip)
                print('add lex-group relat to cache pool.')
                if len(cacheRelationShips) >= unionRange:
                    lexSemSubGraph = self.unionSemRelatSubGraph(
                        graphOptObj, cacheRelationShips)
                    self.constructSemGraphOnNeo(graphOptObj,
                                                lexSemSubGraph)
                    print('construct lex-graph subgraph cache range: ' +
                          str(unionRange) + '!')
                    graphRelatSize += len(cacheRelationShips)
                    cacheRelationShips = []
        # write whatever is left over from the last, partial batch
        if cacheRelationShips:
            lexSemSubGraph = self.unionSemRelatSubGraph(
                graphOptObj, cacheRelationShips)
            self.constructSemGraphOnNeo(graphOptObj, lexSemSubGraph)
            # report the real remainder size, not the batch limit
            print('construct lex-graph surplus subgraph size: ' +
                  str(len(cacheRelationShips)) + '!')
            graphRelatSize += len(cacheRelationShips)
        print('construct lex-graph semgraph on neo size: ' +
              str(graphRelatSize) + '!')
Ejemplo n.º 5
0
def testRelationBtwEntities():
    '''
    Manual check: print the similarity of two fixed entities and the
    relation tuples found between them.

    The similarity comes from WordVecOpt.culSimBtwWVfromFile. The relation
    tuples come from freqAssGraphSupOpt.relationBtwEntities, called twice
    with different numeric and pureFilterTopN arguments. The first two
    items of every tuple are printed, with a separator line between the
    two result sets.
    '''
    entity1 = u'红枣/n'
    entity2 = u'雪梨/n'

    modelStoragePath = ROOT_PATH.root_win64 + u'model\\word2vec\\zongheword2vecModel.vector'

    sim = WordVecOpt(modelStoragePath).culSimBtwWVfromFile(
        entity1, entity2, modelStoragePath)
    print(sim)

    relationWordTuples = freqAssGraphSupOpt.relationBtwEntities(
        modelStoragePath, entity1, entity2, 500, pureFilterTopN=2)
    relationWordTuples2 = freqAssGraphSupOpt.relationBtwEntities(
        modelStoragePath, entity1, entity2, 50, pureFilterTopN=0)

    # A single pre-formatted argument prints the same text whether print
    # is the Python 2 statement or the Python 3 function.
    for relation in relationWordTuples:
        print('%s %s' % (relation[0], relation[1]))
    print('-----------------------------------')
    for relation in relationWordTuples2:
        print('%s %s' % (relation[0], relation[1]))
Ejemplo n.º 6
0
def loadW2VModelFromDisk(modelStoragePath):
    '''
    Create a WordVecOpt for modelStoragePath and load its model from file.

    Returns a tuple (WordVecOpt instance, model returned by
    loadModelfromFile).
    '''
    optHelper = WordVecOpt(modelStoragePath)
    return optHelper, optHelper.loadModelfromFile(modelStoragePath)
Ejemplo n.º 7
0
def prodFieldW2VModel(modelStoragePath, corpusFilePath, dimension_size=100):
    '''
    Create a WordVecOpt sized by dimension_size and build a model from a
    corpus file.

    dimension_size is passed to WordVecOpt as its size argument, and
    corpusFilePath is passed to initTrainWord2VecModel.

    Returns a tuple (WordVecOpt instance, model returned by
    initTrainWord2VecModel).
    '''
    optHelper = WordVecOpt(modelStoragePath, size=dimension_size)
    trainedModel = optHelper.initTrainWord2VecModel(corpusFilePath)
    return optHelper, trainedModel