Esempio n. 1
0
 def discardRows(self, toDiscard):
     """Rebuild the term-document matrix without the given term rows.

     A new TermDocumentMatrix is filled from the stored documents, skipping
     every term whose index in the current matrix's ``term2Index`` appears
     in *toDiscard*. The new matrix then replaces
     ``self.termDocumentMatrix``.

     Args:
         toDiscard: iterable of row indices (values of ``term2Index``)
             identifying the terms to drop.
     """
     # Deduplicate once: this gives constant-time membership tests in the
     # inner loop and keeps a repeated index from shrinking the row count
     # more than once.
     discarded = set(toDiscard)

     nTerms = self._getNumberOfTerms() - len(discarded)
     nDocs = self._getNumberOfDocuments()
     oldMatrix = self.termDocumentMatrix

     newMatrix = TermDocumentMatrix()
     newMatrix.setDimensions(nTerms, nDocs)
     newMatrix.setDocuments(self.docNames)

     for n in range(nDocs):
         doc = self._getNthDocument(n)
         for term in doc.keys():
             # Row indices refer to the matrix being replaced, not the new one.
             if oldMatrix.term2Index[term] in discarded:
                 continue

             if not newMatrix.isTermKnown(term):
                 newMatrix.addTerm(term)
             newMatrix.incCoefficient(term, n, doc[term])

     self.termDocumentMatrix = newMatrix
Esempio n. 2
0
    def _createTermDocMatrixForSink(self, sinkName):
        """Return a TermDocumentMatrix restricted to one sink's functions.

        The function names come from ``_getFunctionNamesFromSink(sinkName)``.
        Their columns are selected from the global matrix, in that order, and
        stored in the new object's ``matrix`` after a ``tolil()`` conversion.

        The term mappings (``term2Index`` / ``index2Term``) and the
        ``index2Doc`` list are assigned by reference, not copied, so they are
        shared with the global matrix and with the returned name list.
        """
        functionNames = self._getFunctionNamesFromSink(sinkName)
        globalMatrix = self.globalTermDocMatrix

        colIndices = [globalMatrix.doc2Index[f] for f in functionNames]
        # Columns are sliced from the tocsc() form of the global matrix.
        cscMatrix = globalMatrix.matrix.tocsc()

        newTermDocMatrix = TermDocumentMatrix()
        newTermDocMatrix.matrix = cscMatrix[:, colIndices].tolil()
        newTermDocMatrix.index2Doc = functionNames
        # enumerate() replaces the xrange index loop, which does not exist on
        # Python 3; for duplicate names the last position still wins.
        newTermDocMatrix.doc2Index = {
            name: position for position, name in enumerate(functionNames)
        }
        newTermDocMatrix.term2Index = globalMatrix.term2Index
        newTermDocMatrix.index2Term = globalMatrix.index2Term
        newTermDocMatrix.nterms = len(newTermDocMatrix.index2Term)
        return newTermDocMatrix
Esempio n. 3
0
 def _createTermDocMatrixForSink(self, sinkName):
     """Return a TermDocumentMatrix restricted to one sink's functions.

     The function names come from ``_getFunctionNamesFromSink(sinkName)``.
     Their columns are selected from the global matrix, in that order, and
     stored in the new object's ``matrix`` after a ``tolil()`` conversion.

     The term mappings (``term2Index`` / ``index2Term``) and the
     ``index2Doc`` list are assigned by reference, not copied, so they are
     shared with the global matrix and with the returned name list.
     """
     functionNames = self._getFunctionNamesFromSink(sinkName)
     globalMatrix = self.globalTermDocMatrix

     colIndices = [globalMatrix.doc2Index[f] for f in functionNames]
     # Columns are sliced from the tocsc() form of the global matrix.
     cscMatrix = globalMatrix.matrix.tocsc()

     newTermDocMatrix = TermDocumentMatrix()
     newTermDocMatrix.matrix = cscMatrix[:, colIndices].tolil()
     newTermDocMatrix.index2Doc = functionNames
     # enumerate() replaces the xrange index loop, which does not exist on
     # Python 3; for duplicate names the last position still wins.
     newTermDocMatrix.doc2Index = {
         name: position for position, name in enumerate(functionNames)
     }
     newTermDocMatrix.term2Index = globalMatrix.term2Index
     newTermDocMatrix.index2Term = globalMatrix.index2Term
     newTermDocMatrix.nterms = len(newTermDocMatrix.index2Term)
     return newTermDocMatrix
Esempio n. 4
0
 def __init__(self):
     """Start out holding a freshly constructed TermDocumentMatrix."""
     freshMatrix = TermDocumentMatrix()
     self.termDocumentMatrix = freshMatrix
Esempio n. 5
0
class NameDictMapToMatrix:
    """Build a TermDocumentMatrix from a name->dict map and a symbol table.

    Two inputs are used, either unpickled from files or passed in directly:

    * ``nameDictMap`` -- an object exposing ``d`` (a mapping from document
      name to that document's ``{term: value}`` mapping) and
      ``getNumberOfEntries()``, which is used as the number of documents.
    * ``allSymbolsDict`` -- an object exposing ``getNumberOfEntries()``,
      which is used as the number of terms.

    The result is kept in ``self.termDocumentMatrix``.
    """

    def __init__(self):
        """Start out holding a freshly constructed TermDocumentMatrix."""
        self.termDocumentMatrix = TermDocumentMatrix()

    def _openNameDictMap(self, filename):
        """Unpickle the name->dict map from *filename* and list its doc names.

        NOTE: unpickling can execute arbitrary code; only load files that
        come from a trusted source.
        """
        # Context manager so the handle is closed even if unpickling fails.
        with open(filename, 'rb') as pickleFile:
            self.nameDictMap = pickle.load(pickleFile)
        self.docNames = list(self.nameDictMap.d.keys())

    def _openAllSymbolsDict(self, filename):
        """Unpickle the symbol table from *filename*.

        NOTE: unpickling can execute arbitrary code; only load files that
        come from a trusted source.
        """
        with open(filename, 'rb') as pickleFile:
            self.allSymbolsDict = pickle.load(pickleFile)

    def convertFromFiles(self, nameDictMapFilename, allSymbolsFilename):
        """Load both inputs from pickle files, then run ``convert()``."""
        self._openNameDictMap(nameDictMapFilename)
        self._openAllSymbolsDict(allSymbolsFilename)
        self.convert()

    def convertFromDicts(self, nameDictMap, allSymbolsDict):
        """Use the given in-memory inputs, then run ``convert()``."""
        self.nameDictMap = nameDictMap
        self.docNames = list(self.nameDictMap.d.keys())
        self.allSymbolsDict = allSymbolsDict
        self.convert()

    def convert(self):
        """Fill ``self.termDocumentMatrix`` from the loaded documents.

        The matrix is sized from the symbol and document counts and given
        the document names. Then, for every document, each of its terms is
        registered if not yet known and incremented by the document's value
        for that term.
        """
        numberOfTerms = self._getNumberOfTerms()
        numberOfDocuments = self._getNumberOfDocuments()
        matrix = self.termDocumentMatrix

        matrix.setDimensions(numberOfTerms, numberOfDocuments)
        matrix.setDocuments(self.docNames)

        for n in range(numberOfDocuments):
            doc = self._getNthDocument(n)
            for term in doc.keys():
                if not matrix.isTermKnown(term):
                    matrix.addTerm(term)
                matrix.incCoefficient(term, n, doc[term])

    def discardRows(self, toDiscard):
        """Rebuild the term-document matrix without the given term rows.

        A new TermDocumentMatrix is filled from the stored documents,
        skipping every term whose index in the current matrix's
        ``term2Index`` appears in *toDiscard*. The new matrix then replaces
        ``self.termDocumentMatrix``.

        Args:
            toDiscard: iterable of row indices (values of ``term2Index``)
                identifying the terms to drop.
        """
        # Deduplicate once: this gives constant-time membership tests in the
        # inner loop and keeps a repeated index from shrinking the row count
        # more than once.
        discarded = set(toDiscard)

        nTerms = self._getNumberOfTerms() - len(discarded)
        nDocs = self._getNumberOfDocuments()
        oldMatrix = self.termDocumentMatrix

        newMatrix = TermDocumentMatrix()
        newMatrix.setDimensions(nTerms, nDocs)
        newMatrix.setDocuments(self.docNames)

        for n in range(nDocs):
            doc = self._getNthDocument(n)
            for term in doc.keys():
                # Row indices refer to the matrix being replaced.
                if oldMatrix.term2Index[term] in discarded:
                    continue

                if not newMatrix.isTermKnown(term):
                    newMatrix.addTerm(term)
                newMatrix.incCoefficient(term, n, doc[term])

        self.termDocumentMatrix = newMatrix

    def save(self, projectRoot):
        """Pickle the matrix to ``projectRoot + 'termDocMatrix.pickl'``.

        *projectRoot* is concatenated as-is, so it needs a trailing path
        separator for the file to land inside that directory.
        """
        # Context manager so the data is flushed and the handle is closed.
        with open(projectRoot + 'termDocMatrix.pickl', 'wb') as pickleFile:
            pickle.dump(self.termDocumentMatrix, pickleFile, protocol=2)

    def getDimensions(self):
        """Return the ``shape`` of the underlying matrix."""
        return self.termDocumentMatrix.matrix.shape

    def _getNthDocument(self, n):
        """Return the term mapping of the document at position *n*."""
        return self.nameDictMap.d[self.docNames[n]]

    def _getNumberOfTerms(self):
        """Return the entry count reported by the symbol table."""
        return self.allSymbolsDict.getNumberOfEntries()

    def _getNumberOfDocuments(self):
        """Return the entry count reported by the name->dict map."""
        return self.nameDictMap.getNumberOfEntries()
Esempio n. 6
0
 def __init__(self):
     """Start out holding a freshly constructed TermDocumentMatrix."""
     freshMatrix = TermDocumentMatrix()
     self.termDocumentMatrix = freshMatrix
Esempio n. 7
0
    def discardRows(self, toDiscard):
        """Rebuild the term-document matrix without the given term rows.

        A new TermDocumentMatrix is filled from the stored documents,
        skipping every term whose index in the current matrix's
        ``term2Index`` appears in *toDiscard*. The new matrix then replaces
        ``self.termDocumentMatrix``.

        Args:
            toDiscard: iterable of row indices (values of ``term2Index``)
                identifying the terms to drop.
        """
        # Deduplicate once: this gives constant-time membership tests in the
        # inner loop and keeps a repeated index from shrinking the row count
        # more than once.
        discarded = set(toDiscard)

        nTerms = self._getNumberOfTerms() - len(discarded)
        nDocs = self._getNumberOfDocuments()
        oldMatrix = self.termDocumentMatrix

        newMatrix = TermDocumentMatrix()
        newMatrix.setDimensions(nTerms, nDocs)
        newMatrix.setDocuments(self.docNames)

        for n in range(nDocs):
            doc = self._getNthDocument(n)
            for term in doc.keys():
                # Row indices refer to the matrix being replaced.
                if oldMatrix.term2Index[term] in discarded:
                    continue

                if not newMatrix.isTermKnown(term):
                    newMatrix.addTerm(term)
                newMatrix.incCoefficient(term, n, doc[term])

        self.termDocumentMatrix = newMatrix
Esempio n. 8
0
class NameDictMapToMatrix:
    """Build a TermDocumentMatrix from a name->dict map and a symbol table.

    Two inputs are used, either unpickled from files or passed in directly:

    * ``nameDictMap`` -- an object exposing ``d`` (a mapping from document
      name to that document's ``{term: value}`` mapping) and
      ``getNumberOfEntries()``, which is used as the number of documents.
    * ``allSymbolsDict`` -- an object exposing ``getNumberOfEntries()``,
      which is used as the number of terms.

    The result is kept in ``self.termDocumentMatrix``.
    """

    def __init__(self):
        """Start out holding a freshly constructed TermDocumentMatrix."""
        self.termDocumentMatrix = TermDocumentMatrix()

    def _openNameDictMap(self, filename):
        """Unpickle the name->dict map from *filename* and list its doc names.

        NOTE: unpickling can execute arbitrary code; only load files that
        come from a trusted source.
        """
        # Context manager so the handle is closed even if unpickling fails.
        with open(filename, 'rb') as pickleFile:
            self.nameDictMap = pickle.load(pickleFile)
        self.docNames = list(self.nameDictMap.d.keys())

    def _openAllSymbolsDict(self, filename):
        """Unpickle the symbol table from *filename*.

        NOTE: unpickling can execute arbitrary code; only load files that
        come from a trusted source.
        """
        with open(filename, 'rb') as pickleFile:
            self.allSymbolsDict = pickle.load(pickleFile)

    def convertFromFiles(self, nameDictMapFilename, allSymbolsFilename):
        """Load both inputs from pickle files, then run ``convert()``."""
        self._openNameDictMap(nameDictMapFilename)
        self._openAllSymbolsDict(allSymbolsFilename)
        self.convert()

    def convertFromDicts(self, nameDictMap, allSymbolsDict):
        """Use the given in-memory inputs, then run ``convert()``."""
        self.nameDictMap = nameDictMap
        self.docNames = list(self.nameDictMap.d.keys())
        self.allSymbolsDict = allSymbolsDict
        self.convert()

    def convert(self):
        """Fill ``self.termDocumentMatrix`` from the loaded documents.

        The matrix is sized from the symbol and document counts and given
        the document names. Then, for every document, each of its terms is
        registered if not yet known and incremented by the document's value
        for that term.
        """
        numberOfTerms = self._getNumberOfTerms()
        numberOfDocuments = self._getNumberOfDocuments()
        matrix = self.termDocumentMatrix

        matrix.setDimensions(numberOfTerms, numberOfDocuments)
        matrix.setDocuments(self.docNames)

        for n in range(numberOfDocuments):
            doc = self._getNthDocument(n)
            for term in doc.keys():
                if not matrix.isTermKnown(term):
                    matrix.addTerm(term)
                matrix.incCoefficient(term, n, doc[term])

    def discardRows(self, toDiscard):
        """Rebuild the term-document matrix without the given term rows.

        A new TermDocumentMatrix is filled from the stored documents,
        skipping every term whose index in the current matrix's
        ``term2Index`` appears in *toDiscard*. The new matrix then replaces
        ``self.termDocumentMatrix``.

        Args:
            toDiscard: iterable of row indices (values of ``term2Index``)
                identifying the terms to drop.
        """
        # Deduplicate once: this gives constant-time membership tests in the
        # inner loop and keeps a repeated index from shrinking the row count
        # more than once.
        discarded = set(toDiscard)

        nTerms = self._getNumberOfTerms() - len(discarded)
        nDocs = self._getNumberOfDocuments()
        oldMatrix = self.termDocumentMatrix

        newMatrix = TermDocumentMatrix()
        newMatrix.setDimensions(nTerms, nDocs)
        newMatrix.setDocuments(self.docNames)

        for n in range(nDocs):
            doc = self._getNthDocument(n)
            for term in doc.keys():
                # Row indices refer to the matrix being replaced.
                if oldMatrix.term2Index[term] in discarded:
                    continue

                if not newMatrix.isTermKnown(term):
                    newMatrix.addTerm(term)
                newMatrix.incCoefficient(term, n, doc[term])

        self.termDocumentMatrix = newMatrix

    def save(self, projectRoot):
        """Pickle the matrix to ``projectRoot + 'termDocMatrix.pickl'``.

        *projectRoot* is concatenated as-is, so it needs a trailing path
        separator for the file to land inside that directory.
        """
        # Context manager so the data is flushed and the handle is closed.
        with open(projectRoot + 'termDocMatrix.pickl', 'wb') as pickleFile:
            pickle.dump(self.termDocumentMatrix, pickleFile, protocol=2)

    def getDimensions(self):
        """Return the ``shape`` of the underlying matrix."""
        return self.termDocumentMatrix.matrix.shape

    def _getNthDocument(self, n):
        """Return the term mapping of the document at position *n*."""
        return self.nameDictMap.d[self.docNames[n]]

    def _getNumberOfTerms(self):
        """Return the entry count reported by the symbol table."""
        return self.allSymbolsDict.getNumberOfEntries()

    def _getNumberOfDocuments(self):
        """Return the entry count reported by the name->dict map."""
        return self.nameDictMap.getNumberOfEntries()