Esempio n. 1
0
    def getWordVec(self, model, wordStr):
        '''
        Look up a single word's vector in a w2v model.

        model: object that supports item lookup by word (model[wordStr]).
        wordStr: lookup key; it is passed to the model unchanged.
        Returns whatever model[wordStr] yields.
        '''
        # Project helper called before touching the model; what it does is
        # not visible from this method.
        ExtraSegOpt().reLoadEncoding()

        wordVec = model[wordStr]
        return wordVec
Esempio n. 2
0
    def copeMSimilarVecsbtwWordLists(self,
                                     model,
                                     wordStrList1,
                                     wordStrList2,
                                     topN_rev=20,
                                     topN=20):
        '''
        Rank the words that relate a source word list to a target word list.

        The query runs in two steps:
        1. wordStrList2 (target) is used alone as the negative list to obtain
           topN_rev "reverse" words;
        2. wordStrList1 (source) is the positive list and the reverse words
           are the negative list of the final query, limited to topN results.

        Every word is decoded from UTF-8 before it is handed on.
        Returns the result of the second queryMSimilarVecswithPosNeg call.
        '''
        ExtraSegOpt().reLoadEncoding()

        sourceWords = [word.decode('utf-8') for word in wordStrList1]
        targetWords = [word.decode('utf-8') for word in wordStrList2]

        # Step 1: no positive words, target words as negatives.
        reversePairs = self.queryMSimilarVecswithPosNeg(
            model, [], targetWords, topN_rev)
        # Only element [0] of each returned pair is kept.
        reverseWords = [pair[0].decode('utf-8') for pair in reversePairs]

        # Step 2: source words against the reverse words.
        return self.queryMSimilarVecswithPosNeg(
            model, sourceWords, reverseWords, topN)
Esempio n. 3
0
    def culSimBtwWordVecs(self, model, wordStr1, wordStr2):
        '''
        Basic similarity query for two words.

        Both words are decoded from UTF-8 and passed to model.similarity;
        the value it returns is handed back unchanged.
        '''
        ExtraSegOpt().reLoadEncoding()

        firstWord = wordStr1.decode('utf-8')
        secondWord = wordStr2.decode('utf-8')
        return model.similarity(firstWord, secondWord)
Esempio n. 4
0
    def queryMostSimilarWordVec(self, model, wordStr, topN=20):
        '''
        Basic most-similar-words query for a single word.

        wordStr is decoded from UTF-8 and passed to model.most_similar with
        topn=topN. The model's result is returned unchanged; per the original
        author's note it is a list of pairs ([0] word, [1] score).
        '''
        ExtraSegOpt().reLoadEncoding()

        queryWord = wordStr.decode('utf-8')
        return model.most_similar(queryWord, topn=topN)
def listAllFilePathInDirectory(dirPath):
    '''
    List the paths of all entries directly inside a directory.

    dirPath: directory to list; a trailing path separator is optional.
    Returns a list holding dirPath joined with every name reported by
    os.listdir(dirPath). Entries are not filtered and the listing is not
    recursive.
    '''
    ExtraSegOpt().reLoadEncoding()

    # os.path.join adds the separator only when dirPath lacks one. Plain
    # string concatenation glued the directory and file names together
    # whenever the caller omitted the trailing separator.
    return [os.path.join(dirPath, name) for name in os.listdir(dirPath)]
Esempio n. 6
0
    def queryMSimilarVecswithPosNeg(self,
                                    model,
                                    posWordStrList,
                                    negWordStrList,
                                    topN=20):
        '''
        Basic most-similar-words query with positive and negative word lists.

        Every word in both lists is decoded from UTF-8, then the lists are
        passed to model.most_similar as positive= / negative= with topn=topN.
        The model's result is returned unchanged; per the original author's
        note it is a list of pairs ([0] word, [1] score).
        '''
        ExtraSegOpt().reLoadEncoding()

        positiveWords = [word.decode('utf-8') for word in posWordStrList]
        negativeWords = [word.decode('utf-8') for word in negWordStrList]
        return model.most_similar(positive=positiveWords,
                                  negative=negativeWords,
                                  topn=topN)
def folderFilesNameEntities(corpusDirPath,
                            userDictPath=None,
                            dictRewrite=False):
    '''
    Extract entity names from the file names found in a corpus folder.

    For every name returned by os.listdir(corpusDirPath):
      * everything from the first u'(seg)' marker onwards is dropped; a name
        without the marker is kept whole;
      * if what remains contains a non-empty "(...)" group, a group at the
        very start is removed together with its parentheses, otherwise the
        name is cut at the opening parenthesis.
    Duplicate entities are skipped and first-seen order is preserved.

    userDictPath: when not None, the entities are written to this file, one
        per line, as "<entity> n" (the original author describes it as a
        user dictionary for the jieba analyser).
    dictRewrite: selects the open mode, 'w+' when true and 'w' otherwise.
    Returns the list of entity names.
    '''
    ExtraSegOpt().reLoadEncoding()

    entities = []
    seen = set()  # O(1) duplicate check; `entities` keeps the order
    for name in os.listdir(corpusDirPath):
        # find() returns -1 when the marker is missing; slicing with that
        # value used to chop the last character off the name.
        segIdx = name.find(u'(seg)')
        entity = name[:segIdx] if segIdx != -1 else name

        openIdx = entity.find(u'(')
        closeIdx = entity.find(u')')
        if openIdx != -1 and closeIdx != -1:
            # Empty for "()" and when ")" comes before "(" - then the name
            # is left untouched.
            extra = entity[openIdx + 1:closeIdx]
            if extra:
                # Work from the parenthesis positions rather than searching
                # for the group text, which may also occur earlier in the
                # name.
                if openIdx == 0:
                    entity = entity[closeIdx + 1:]
                else:
                    entity = entity[:openIdx]

        if entity not in seen:
            seen.add(entity)
            entities.append(entity)

    # write user's word directory
    if userDictPath is not None:
        entitiesFwStr = u'\n'.join(entity + u' n' for entity in entities)

        # NOTE(review): 'w' and 'w+' both truncate the file, so dictRewrite
        # does not change what ends up on disk - confirm whether append mode
        # was intended for dictRewrite=False.
        mode = 'w+' if dictRewrite else 'w'
        # `with` closes the file even if write() raises.
        with open(userDictPath, mode) as fw:
            fw.write(entitiesFwStr)

    return entities
Esempio n. 8
0
    def updateWord2VecModel(self, corpusFilePath, modelFilePath=None):
        '''
        Update a w2v model loaded from disk with additional corpus data.

        corpusFilePath: corpus location; localFileOptUnit.checkFileState
            classifies it, and the values handled here are u'error',
            u'file', u'opened' and u'directory'.
        modelFilePath: model file to load; defaults to self.modelPath.

        For u'file' / u'opened' the corpus is passed to updateW2VModelUnit
        directly; for u'directory' every path listed by
        localFileOptUnit.listAllFilePathInDirectory is passed in turn.
        When the state is u'error' a warning is issued and None is returned
        without loading the model.
        '''
        ExtraSegOpt().reLoadEncoding()

        fileType = localFileOptUnit.checkFileState(corpusFilePath)
        if fileType == u'error':
            warnings.warn('load file error!')
            return None
        else:
            if modelFilePath is None:
                modelFilePath = self.modelPath
            model = self.loadModelfromFile(modelFilePath)
            # TODO add safe_model == False
            # Membership test: the former `fileType == u'file' or u'opened'`
            # was always true (a non-empty string is truthy), which made the
            # directory branch below unreachable.
            if fileType in (u'file', u'opened'):
                self.updateW2VModelUnit(model, corpusFilePath)
            elif fileType == u'directory':
                corpusFiles = localFileOptUnit.listAllFilePathInDirectory(
                    corpusFilePath)
                for corpusFile in corpusFiles:
                    self.updateW2VModelUnit(model, corpusFile)