def prodPreWordEmbedingMat(self, gensimModelPath, wordSequencesList):
    '''
    Build an embedding matrix for every distinct word in the given sequences.

    The word2vec model is loaded from gensimModelPath through WordVecOpt.
    One matrix row is allocated per distinct word; a row is filled with the
    word's vector when the word is in the model vocabulary and is left as
    zeros otherwise.

    NOTE: rows follow the iteration order of a set built from the words, and
    the word list itself is not returned.

    Returns a tuple (nb_words, EMBEDDING_DIM, embedding_matrix).
    '''
    # load the gensim word-vector model and grab its vocabulary once
    vecOpt = WordVecOpt(modelPath=gensimModelPath)
    model = vecOpt.loadModelfromFile(gensimModelPath)
    vocab = model.vocab
    vectorDim = model.vector_size

    # flatten all sequences, then keep each distinct word once
    flatWords = [word for sequence in wordSequencesList for word in sequence]
    uniqueWords = list(set(flatWords))
    nb_words = len(uniqueWords)
    print('nb_words: ' + str(nb_words))

    embedding_matrix = numpy.zeros((nb_words, vectorDim))
    for row, word in enumerate(uniqueWords):
        if word not in vocab:
            # out-of-vocabulary words keep their all-zero row
            continue
        embedding_matrix[row] = vecOpt.getWordVec(model, word)

    return nb_words, vectorDim, embedding_matrix
def buildBasicSemGraph(self, w2vModelPath, allWordList, topN=20, edgeThreshold=0.2, unionRange=60):
    '''
    Build the basic semantic graph and write it out in batches.

    Entity nodes are created from allWordList via createBasicEmtityNodes.
    For every ordered pair of distinct nodes a relationship is produced by
    createBasicRelasBtwNodes; relationships are pooled and, each time the
    pool holds unionRange items, merged with unionSemRelatSubGraph and
    pushed with constructSemGraphOnNeo. Whatever remains in the pool after
    the last pair is flushed the same way.

    Parameters:
        w2vModelPath  -- path handed to WordVecOpt
        allWordList   -- words handed to createBasicEmtityNodes
        topN          -- forwarded to createBasicRelasBtwNodes
        edgeThreshold -- forwarded to createBasicRelasBtwNodes
        unionRange    -- number of relationships per flushed batch

    Returns nothing; progress is reported with print.
    '''
    graphOptObj = NeoDataGraphOpt()
    w2vOptObj = WordVecOpt(w2vModelPath)
    print('ready to build semantic graph!')

    nounNodes = self.createBasicEmtityNodes(graphOptObj, allWordList)

    cacheRelationShips = []
    graphRelatSize = 0
    for i, headNode in enumerate(nounNodes):
        for j, tailNode in enumerate(nounNodes):
            if i == j:
                continue
            adjRelationShip = self.createBasicRelasBtwNodes(
                w2vOptObj, graphOptObj, headNode, tailNode, topN, edgeThreshold)
            # Always pool the new relationship first. The previous version
            # only flushed when the pool was full and silently discarded the
            # relationship that triggered the flush.
            cacheRelationShips.append(adjRelationShip)
            print('add relat to cache pool.')
            if len(cacheRelationShips) >= unionRange:
                batchSize = len(cacheRelationShips)
                semSubGraph = self.unionSemRelatSubGraph(graphOptObj, cacheRelationShips)
                self.constructSemGraphOnNeo(graphOptObj, semSubGraph)
                print('construct subgraph cache range: ' + str(batchSize) + '!')
                graphRelatSize += batchSize
                cacheRelationShips = []

    # flush whatever did not fill a complete batch
    if cacheRelationShips:
        surplusSize = len(cacheRelationShips)
        semSubGraph = self.unionSemRelatSubGraph(graphOptObj, cacheRelationShips)
        self.constructSemGraphOnNeo(graphOptObj, semSubGraph)
        print('construct surplus subgraph size: ' + str(surplusSize) + '!')
        graphRelatSize += surplusSize

    print('construct semgraph on neo size: ' + str(graphRelatSize) + '!')
def buildLexGroupSemGraph(self, w2vModelPath, allWordList, lex_cluster=None, vec_z_ratio=100, canopy_t_ratio=3, topN_rev=20, topN=20, edgeThreshold=0.2, unionRange=60):
    '''
    Build the lex-group semantic graph and write it out in batches.

    Lex-group nodes are created via createLexClustEmtityNodes. For every
    ordered pair of distinct nodes a relationship is produced by
    createLexGroupRelasBtwNodes; relationships are pooled and, each time
    the pool holds unionRange items, merged with unionSemRelatSubGraph and
    pushed with constructSemGraphOnNeo. Whatever remains in the pool after
    the last pair is flushed the same way.

    Parameters:
        w2vModelPath   -- path handed to WordVecOpt
        allWordList    -- words handed to createLexClustEmtityNodes
        lex_cluster    -- forwarded as `cluster` to createLexClustEmtityNodes
        vec_z_ratio    -- forwarded to createLexClustEmtityNodes
        canopy_t_ratio -- forwarded to createLexClustEmtityNodes
        topN_rev       -- forwarded to createLexGroupRelasBtwNodes
        topN           -- forwarded to createLexGroupRelasBtwNodes
        edgeThreshold  -- forwarded to createLexGroupRelasBtwNodes
        unionRange     -- number of relationships per flushed batch

    Returns nothing; progress is reported with print.
    '''
    graphOptObj = NeoDataGraphOpt()
    w2vOptObj = WordVecOpt(w2vModelPath)
    print('ready to build lex-group semantic graph!')

    lexGroupNodes = self.createLexClustEmtityNodes(
        graphOptObj, w2vOptObj, allWordList,
        cluster=lex_cluster,
        vec_z_ratio=vec_z_ratio,
        canopy_t_ratio=canopy_t_ratio)

    cacheRelationShips = []
    graphRelatSize = 0
    for i, headNode in enumerate(lexGroupNodes):
        for j, tailNode in enumerate(lexGroupNodes):
            if i == j:
                continue
            adjRelationShip = self.createLexGroupRelasBtwNodes(
                w2vOptObj, graphOptObj, headNode, tailNode,
                topN_rev, topN, edgeThreshold)
            # Always pool the new relationship first. The previous version
            # only flushed when the pool was full and silently discarded the
            # relationship that triggered the flush.
            cacheRelationShips.append(adjRelationShip)
            print('add lex-group relat to cache pool.')
            if len(cacheRelationShips) >= unionRange:
                batchSize = len(cacheRelationShips)
                lexSemSubGraph = self.unionSemRelatSubGraph(
                    graphOptObj, cacheRelationShips)
                self.constructSemGraphOnNeo(graphOptObj, lexSemSubGraph)
                print('construct lex-graph subgraph cache range: ' + str(batchSize) + '!')
                graphRelatSize += batchSize
                cacheRelationShips = []

    # flush whatever did not fill a complete batch
    if cacheRelationShips:
        surplusSize = len(cacheRelationShips)
        lexSemSubGraph = self.unionSemRelatSubGraph(
            graphOptObj, cacheRelationShips)
        self.constructSemGraphOnNeo(graphOptObj, lexSemSubGraph)
        # report the real surplus size; the old message repeated unionRange
        print('construct lex-graph surplus subgraph size: ' + str(surplusSize) + '!')
        graphRelatSize += surplusSize

    print('construct lex-graph semgraph on neo size: ' + str(graphRelatSize) + '!')
def testRelationBtwEntities():
    '''
    Manual check of similarity and relation words between two fixed entities.

    Prints the similarity returned by WordVecOpt.culSimBtwWVfromFile for the
    two hard-coded entities, then prints the first two fields of every tuple
    returned by freqAssGraphSupOpt.relationBtwEntities for two different
    argument settings, separated by a dashed line.
    '''
    entity1 = u'红枣/n'
    entity2 = u'雪梨/n'
    modelStoragePath = ROOT_PATH.root_win64 + u'model\\word2vec\\zongheword2vecModel.vector'

    sim = WordVecOpt(modelStoragePath).culSimBtwWVfromFile(
        entity1, entity2, modelStoragePath)
    print(sim)

    relationWordTuples = freqAssGraphSupOpt.relationBtwEntities(
        modelStoragePath, entity1, entity2, 500, pureFilterTopN=2)
    relationWordTuples2 = freqAssGraphSupOpt.relationBtwEntities(
        modelStoragePath, entity1, entity2, 50, pureFilterTopN=0)

    # A single formatted argument keeps the output identical whether print
    # is the Python 2 statement or the Python 3 function.
    for relation in relationWordTuples:
        print('%s %s' % (relation[0], relation[1]))
    print('-----------------------------------')
    for relation in relationWordTuples2:
        print('%s %s' % (relation[0], relation[1]))
def loadW2VModelFromDisk(modelStoragePath):
    '''
    Create a WordVecOpt for modelStoragePath and load its model from file.

    Returns a tuple (WordVecOpt instance, model returned by loadModelfromFile).
    '''
    optObj = WordVecOpt(modelStoragePath)
    return optObj, optObj.loadModelfromFile(modelStoragePath)
def prodFieldW2VModel(modelStoragePath, corpusFilePath, dimension_size=100):
    '''
    Create a WordVecOpt with the given vector size and train it on a corpus.

    dimension_size is passed to WordVecOpt as `size`; corpusFilePath is
    passed to initTrainWord2VecModel.

    Returns a tuple (WordVecOpt instance, model returned by the training call).
    '''
    optObj = WordVecOpt(modelStoragePath, size=dimension_size)
    return optObj, optObj.initTrainWord2VecModel(corpusFilePath)