コード例 #1
0
    def interpretation_vector(self, text):
        '''Converts a text fragment string into a row vector where the i'th
        entry corresponds to the total TF-IDF score of the text fragment
        for concept i.

        The text is first passed through self.clean and split on whitespace.
        Every word found in self.word2index contributes the matching row of
        the chunked matrix files, multiplied by the number of times the word
        occurs in the text. Words missing from self.word2index are ignored.

        Returns a 1 x self.n_concepts CSR matrix, which is all zeros when no
        word of the text is known.'''

        #Remove mess (quotes, parentheses etc) from text
        text = self.clean(text)

        #Interpretation vector to be returned
        result = sps.csr_matrix((1, self.n_concepts), dtype=float)

        #Group the known words by the file holding their row, like so:
        #{file number : [(row number, no. of occurrences), ...]}
        #This way every chunk file is loaded once instead of once per word.
        rows_by_file = {}
        #items() rather than iteritems() so this runs on Python 2 and 3 alike
        for word, count in Counter(text.split()).items():
            #Only the vocabulary lookup is guarded, so a KeyError raised
            #anywhere else is no longer silently swallowed
            try:
                ind = self.word2index[word]
            except KeyError:
                continue  #No data on this word -> discard

            #Which file to look in (floor division keeps this exact for ints)
            file_number = int(ind // row_chunk_size)
            #And which row to extract
            row_number = ind % row_chunk_size
            rows_by_file.setdefault(file_number, []).append((row_number, count))

        #Add the weighted rows, one chunk file at a time
        for file_number, rows in rows_by_file.items():
            filename = matrix_dir + str(file_number) + extensions['matrix']
            with open(filename, 'r') as f:
                temp = shared.mload(f)
            for row_number, count in rows:
                result = result + count * temp.getrow(row_number)

        #Done. Return row vector as a 1x#concepts CSR matrix
        return result
コード例 #2
0
 def interpretation_vector(self, text):
     '''Converts a text fragment string into a row vector where the i'th
     entry corresponds to the total TF-IDF score of the text fragment
     for concept i.

     The fragment is cleaned with self.clean and split on whitespace. For
     each word listed in self.word2index, the word's row is read from the
     matrix file that contains it and added to the result once per
     occurrence of the word. Words without an index are skipped.

     Returns the sum as a 1 x self.n_concepts CSR matrix (all zeros if none
     of the words has an index).'''

     #Remove mess (quotes, parentheses etc) from text
     text = self.clean(text)

     #Interpretation vector to be returned
     result = sps.csr_matrix((1, self.n_concepts), dtype=float)

     #Collect the rows needed from each file before touching the disk:
     #{file number : [(row number, no. of occurrences), ...]}
     wanted = {}
     #items() exists on both Python 2 and 3, iteritems() only on Python 2
     for word, count in Counter(text.split()).items():
         try:
             ind = self.word2index[word]
         except KeyError:
             #No data on this word -> discard. Nothing else sits inside the
             #try, so unrelated KeyErrors are not hidden.
             continue

         #Which file to look in, using exact integer division
         file_number = int(ind // row_chunk_size)
         #And which row to extract
         row_number = ind % row_chunk_size
         wanted.setdefault(file_number, []).append((row_number, count))

     #Load every needed file exactly once and add up its weighted rows
     for file_number in wanted:
         filename = matrix_dir+str(file_number)+extensions['matrix']
         with open(filename, 'r') as f:
             temp = shared.mload(f)
         for row_number, count in wanted[file_number]:
             result = result + count*temp.getrow(row_number)

     #Done. Return row vector as a 1x#concepts CSR matrix
     return result
コード例 #3
0
ファイル: matrix_builder.py プロジェクト: pangyujin/FYP
    def writeout():
        '''Merge every row chunk of mtx into its file on disk.

        The chunks come from matrix_chopper(mtx, row_chunk_size). When the
        file of a chunk can already be read, the matrix stored in it is added
        to the new chunk and the sum is written back; otherwise the chunk is
        written unchanged. Returns None.'''
        for chunk_id, chunk in matrix_chopper(mtx, row_chunk_size):
            path = matrix_dir + str(chunk_id) + extensions['matrix']
            log("Writing out chunk %s" % chunk_id)

            #Fold in whatever an earlier call stored for this chunk
            try:
                with open(path, 'r') as stored:
                    chunk = chunk + shared.mload(stored)
            except IOError:
                #Nothing readable on disk yet - keep the chunk as it is
                pass

            #Write the (possibly merged) chunk back to its file
            with open(path, 'w') as target:
                shared.mdump(chunk, target)
コード例 #4
0
 def writeout():
     '''Store mtx on disk as one file per block of row_chunk_size rows.

     Each block produced by matrix_chopper is added to the matrix already
     saved under the same file name, if that file can be read, and the
     result replaces the file contents. Returns None.'''
     for number, fresh in matrix_chopper(mtx, row_chunk_size):
         target_name = matrix_dir+str(number)+extensions['matrix']
         log("Writing out chunk %s" % number)

         #Start from the new block and add the saved one when there is one
         combined = fresh
         try:
             with open(target_name, 'r') as saved:
                 combined = fresh + shared.mload(saved)
         except IOError:
             #No saved block could be read, so the new block stands alone
             pass

         #Overwrite the file with the combined block
         with open(target_name, 'w') as out:
             shared.mdump(combined, out)
コード例 #5
0
ファイル: matrix_builder.py プロジェクト: pangyujin/FYP
def _tfidf_row(counts, n_concepts):
    '''Return the unit-length TF-IDF weights for the non-zero counts of a word.

    counts     -- non-zero entries of the word's row, i.e. how often the word
                  occurs in each concept that contains it
    n_concepts -- total number of concepts (matrix columns)

    Every count m is mapped to (1 + log(m)) * log(n_concepts / len(counts))
    and the result is scaled to Euclidean norm 1. When the norm is zero (the
    word occurs in every concept, so the IDF factor is log(1) = 0) the zeros
    are returned unscaled instead of dividing by zero.'''
    counts = np.asarray(counts, dtype=float)
    #Force true division: with two ints Python 2 would truncate the ratio
    idf = np.log(float(n_concepts) / len(counts))
    weights = (1.0 + np.log(counts)) * idf
    norm = np.linalg.norm(weights)
    if norm > 0:
        weights *= 1.0 / norm
    return weights


def _prune_row(weights, window_size, cutoff):
    '''Zero, in place, the tail of weights once the sorted scores level off.

    The weights are ranked in descending order. At the first rank i where the
    score window_size places further down is still at least cutoff times the
    score at rank i, every entry from rank i onwards is set to zero. Nothing
    is changed when window_size is not smaller than the number of weights.'''
    n_docs = len(weights)
    #Don't prune if the window exceeds the array bounds
    if window_size >= n_docs:
        return

    #order[k] is the position in weights of the k'th largest score
    order = np.argsort(weights)[::-1]
    ranked = weights[order]

    #Ranks at which the pruning condition holds; only the first one matters
    flat = np.nonzero(
        ranked[window_size:] >= cutoff * ranked[:n_docs - window_size])[0]
    if len(flat):
        weights[order[flat[0]:]] = 0


def main():
    '''Build the word/concept TF-IDF matrix and store it as chunk files.

    Steps:
      1. Delete the matrix files left in matrix_dir by an earlier run.
      2. Load the word -> row and concept -> column index dicts.
      3. Count word occurrences per concept from the content files in
         temp_dir, flushing the counts to the chunk files every
         column_chunk_size content files and once more at the end.
      4. Rewrite every chunk file with normalised TF-IDF weights, pruned
         with shared.window_size / shared.cutoff when prune is set.
      5. Send a notification if shared.notify is set and close logfile.

    Returns None.'''
    #Cleanup
    for stale in glob.glob(matrix_dir + '/*' + extensions['matrix']):
        os.remove(stale)

    #Set pruning parameters
    window_size = shared.window_size
    cutoff = shared.cutoff

    #Read in dicts mapping words and concepts to their respective indices.
    #The files are opened in with-blocks so the handles get closed again.
    log("Reading in word/index data")
    with open(matrix_dir + 'word2index.ind', 'r') as f:
        word2index = shared.load(f)
    with open(matrix_dir + 'concept2index.ind', 'r') as f:
        concept2index = shared.load(f)
    log("...Done!")

    #==============================================================================
    #     Construct count matrix in small chunks
    #==============================================================================

    #Count words and concepts
    n_words = len(word2index)
    n_concepts = len(concept2index)

    #Determine matrix dimensions
    matrix_shape = (n_words, n_concepts)

    #Allocate sparse matrix. Dict-of-keys should be faster for iterative
    #construction. Convert to csr for fast row operations later.
    mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    def matrix_chopper(matrix, dim):
        '''Generator to split a huge matrix into small submatrices, which can
        then be stored in individual files.
        This is handy both when constructing the matrix (building the whole
        matrix without saving to files in the process takes about 50 gigs RAM),
        and when applying it, as this allows one to load only the submatrix
        relevant to a given word.

        Yields (submatrix number, CSR submatrix of at most dim rows).'''
        rows = matrix.get_shape()[0]
        for counter, ind in enumerate(range(0, rows, dim)):
            #A row slice replaces stacking the rows one by one
            yield counter, matrix[ind:ind + dim].tocsr()

    def writeout(matrix):
        '''Saves matrix as small submatrices in separate files, adding each
        submatrix to the one already stored in its file, if any.'''
        for n, submatrix in matrix_chopper(matrix, row_chunk_size):
            filename = matrix_dir + str(n) + extensions['matrix']
            #Update submatrix if it's already partially calculated
            log("Writing out chunk %s" % n)
            try:
                with open(filename, 'r') as f:
                    submatrix = submatrix + shared.mload(f)
            except IOError:
                pass  #File doesn't exist yet, so no need to change submatrix

            #Dump the submatrix to file
            with open(filename, 'w') as f:
                shared.mdump(submatrix, f)

    log("Constructing matrix.")
    filelist = glob.glob(temp_dir + '*' + extensions['content'])
    n_files = len(filelist)
    files_read = 0
    for filename in filelist:
        with open(filename, 'r') as f:
            content = shared.load(f)

        #Loop over concepts (columns) as so we don't waste time with rare words
        #(items() instead of iteritems() works on Python 2 and 3 alike)
        for concept, entry in content.items():
            #This is the column index (concept w. index j)
            j = concept2index[concept]

            #Convert concept text to a 'countmap' like so: {word : n} and
            #store the number of times word i occurs in concept j
            for word, count in Counter(entry['text'].split()).items():
                mtx[word2index[word], j] = count

        #Update file count. The total is len(filelist); the previous
        #len(filelist) - 1 made the last message read "no. N of N-1".
        files_read += 1
        log("Processed content file no. %s of %s - %s" %
            (files_read, n_files, percentof(files_read, n_files)))

        if files_read % column_chunk_size == 0:
            #Flush the counts collected so far and start on an empty matrix
            writeout(mtx.tocsr())
            #NOTE(review): unlike the first allocation this one passes no
            #dtype - confirm whether dtype=datatype was intended here.
            mtx = sps.dok_matrix(matrix_shape)

    #Convert matrix to CSR format and write to files.
    writeout(mtx.tocsr())

    #==============================================================================
    # Count matrix/matrices constructed - computing TF-IDF
    #==============================================================================

    log("Done - computing TF-IDF")

    #Grab list of matrix files (containing the submatrices from before)
    matrixfiles = glob.glob(matrix_dir + "*" + extensions['matrix'])
    words_processed = 0  #for logging purposes

    for filename in matrixfiles:
        with open(filename, 'r') as f:
            mtx = shared.mload(f)

        #The weights are fractions, and non floats round to zero w/o warning
        #when written into the data array, so make sure it holds floats
        if mtx.data.dtype.kind != 'f':
            mtx = mtx.astype(float)

        #Number of words in a submatrix
        n_rows = mtx.get_shape()[0]

        for w in range(n_rows):
            #Bounds of the non-zero elements of the row corresponding to word w
            start, stop = mtx.indptr[w], mtx.indptr[w + 1]
            if start == stop:
                continue

            #Map all elements to TF-IDF and normalize the row
            row = _tfidf_row(mtx.data[start:stop], n_concepts)

            #Inverted index pruning
            if prune:
                _prune_row(row, window_size, cutoff)

            #Update matrix
            mtx.data[start:stop] = row

            #Log it
            words_processed += 1
            if words_processed % 10**3 == 0:
                log("Processing word %s of %s - %s" %
                    (words_processed, n_words,
                     percentof(words_processed, n_words)))

        #Keep it sparse - no need to store zeroes
        mtx.eliminate_zeros()
        with open(filename, 'w') as f:
            shared.mdump(mtx, f)

    log("Done!")

    #Notify that the job is done. Best effort: a failed push is only logged,
    #but Ctrl-C and SystemExit are no longer caught by accident.
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except Exception:
            log("Job's done. Push failed.")

    logfile.close()
コード例 #6
0
def main():
    '''Construct the word x concept TF-IDF matrix as a set of chunk files.

    Old matrix files in matrix_dir are removed first. The word and concept
    index dicts are then loaded, the per-concept word counts are gathered
    from the content files in temp_dir and written out in chunks of
    row_chunk_size rows, and finally every chunk file is rewritten with
    normalised (and, if prune is set, pruned) TF-IDF weights. A notification
    is pushed when shared.notify is set, and logfile is closed at the end.

    Returns None.'''
    #Cleanup
    for old_file in glob.glob(matrix_dir + '/*'+extensions['matrix']):
        os.remove(old_file)

    #Set pruning parameters
    window_size = shared.window_size
    cutoff = shared.cutoff

    #Read in dicts mapping words and concepts to their respective indices,
    #closing each file as soon as it has been loaded
    log("Reading in word/index data")
    with open(matrix_dir+'word2index.ind', 'r') as f:
        word2index = shared.load(f)
    with open(matrix_dir+'concept2index.ind', 'r') as f:
        concept2index = shared.load(f)
    log("...Done!")

    #==============================================================================
    #     Construct count matrix in small chunks
    #==============================================================================

    #Count words and concepts
    n_words = len(word2index)
    n_concepts = len(concept2index)

    #Determine matrix dimensions
    matrix_shape = (n_words, n_concepts)

    #Allocate sparse matrix. Dict-of-keys should be faster for iterative
    #construction. Convert to csr for fast row operations later.
    mtx = sps.dok_matrix(matrix_shape, dtype = datatype)

    def matrix_chopper(matrix, dim):
        '''Generator to split a huge matrix into small submatrices, which can
        then be stored in individual files.
        This is handy both when constructing the matrix (building the whole
        matrix without saving to files in the process takes about 50 gigs RAM),
        and when applying it, as this allows one to load only the submatrix
        relevant to a given word.

        Yields pairs of submatrix number and CSR submatrix of up to dim rows.'''
        ind = 0
        counter = 0
        rows = matrix.get_shape()[0]
        while ind < rows:
            end = min(ind+dim, rows)
            #Return pair of submatrix number and the submatrix itself
            #(range instead of xrange so this also runs on Python 3)
            yield counter, sps.vstack([matrix.getrow(i)
                                       for i in range(ind, end)],
                                      format = 'csr')
            counter += 1
            ind += dim

    def writeout():
        '''Saves the current mtx as small submatrices in separate files,
        adding each one to the submatrix already stored in its file, if any.'''
        for n, submatrix in matrix_chopper(mtx, row_chunk_size):
            filename = matrix_dir+str(n)+extensions['matrix']
            #Update submatrix if it's already partially calculated
            log("Writing out chunk %s" % n)
            try:
                with open(filename, 'r') as f:
                    submatrix = submatrix + shared.mload(f)
            except IOError:
                pass #File doesn't exist yet, so no need to change submatrix

            #Dump the submatrix to file
            with open(filename, 'w') as f:
                shared.mdump(submatrix, f)

    log("Constructing matrix.")
    filelist = glob.glob(temp_dir + '*'+extensions['content'])
    files_read = 0
    for filename in filelist:
        with open(filename, 'r') as f:
            content = shared.load(f)

        #Loop over concepts (columns) as so we don't waste time with rare words.
        #items() is available on Python 2 and 3, iteritems() only on Python 2.
        for concept, entry in content.items():
            #This is the column index (concept w. index j)
            j = concept2index[concept]

            #Convert concept text to a 'countmap' like so: {word : n}
            wordmap = Counter(entry['text'].split())

            #Add them all to the matrix
            for word, count in wordmap.items():
                #Find row index of the current word
                i = word2index[word]

                #Add the number of times word i occurs in concept j to the matrix
                mtx[i,j] = count

        #Update file count. The total reported is len(filelist); it used to
        #be len(filelist)-1, which made the last message read "N of N-1".
        files_read += 1
        log("Processed content file no. %s of %s - %s"
            % (files_read, len(filelist), percentof(files_read, len(filelist))))

        if files_read % column_chunk_size == 0:
            mtx = mtx.tocsr()
            writeout()
            #NOTE(review): no dtype is passed here, unlike the first
            #allocation above - confirm whether dtype = datatype was meant.
            mtx = sps.dok_matrix(matrix_shape)

    #Convert matrix to CSR format and write to files.
    mtx = mtx.tocsr()
    writeout()

    #==============================================================================
    # Count matrix/matrices constructed - computing TF-IDF
    #==============================================================================

    log("Done - computing TF-IDF")

    #Grab list of matrix files (containing the submatrices from before)
    matrixfiles = glob.glob(matrix_dir + "*" + extensions['matrix'])
    words_processed = 0  #for logging purposes

    for filename in matrixfiles:
        with open(filename, 'r') as f:
            mtx = shared.mload(f)

        #Non floats round to zero w/o warning when the weights are written
        #back below, so convert the stored values to float up front
        if mtx.data.dtype.kind != 'f':
            mtx = mtx.astype(float)

        #Number of words in a submatrix
        n_rows = mtx.get_shape()[0]

        for w in range(n_rows):
            #Grab non-zero elements from the row corresponding to word w
            first, last = mtx.indptr[w], mtx.indptr[w+1]
            counts = mtx.data[first:last].astype(float)
            #Number of documents containing w
            n_docs = len(counts)
            if n_docs == 0:
                continue

            #Inverse document frequency. float() forces true division; with
            #two ints Python 2 would truncate the ratio before the log.
            idf = np.log(float(n_concepts)/n_docs)

            #Map all elements to TF-IDF in one array operation
            row = (1.0+np.log(counts))*idf

            #Normalize the row. A word occurring in every concept has idf 0
            #and therefore norm 0; its all-zero row is left as it is rather
            #than turned into NaNs by a division by zero.
            norm = np.linalg.norm(row)
            if norm > 0:
                row *= 1.0/norm

            #Start inverted index pruning
            #Don't prune if the window exceeds the array bounds (duh)
            if prune and window_size < n_docs:
                #Obtain list of indices such that row[indices] is sorted,
                #largest score first
                indices = np.argsort(row)[::-1]
                sorted_row = row[indices]

                #Go through sorted row and truncate when pruning condition is met
                for i in range(n_docs-window_size):
                    if sorted_row[i+window_size] >= cutoff*sorted_row[i]:
                        #Truncate, i.e. set the entries ranked i and lower to
                        #zero, directly at their original positions
                        row[indices[i:]] = 0
                        break

            #Update matrix
            mtx.data[first:last] = row

            #Log it
            words_processed += 1
            if words_processed % 10**3 == 0:
                log("Processing word %s of %s - %s" %
                    (words_processed, n_words,
                     percentof(words_processed, n_words)))

        #Keep it sparse - no need to store zeroes
        mtx.eliminate_zeros()
        with open(filename, 'w') as f:
            shared.mdump(mtx, f)

    log("Done!")

    #Notify that the job is done. A failed push is only logged (best
    #effort), but KeyboardInterrupt and SystemExit are allowed to propagate.
    if shared.notify:
        try:
            shared.pushme(sys.argv[0]+' completed.')
        except Exception:
            log("Job's done. Push failed.")

    logfile.close()