def interpretation_vector(self, text):
    '''Converts a text fragment string into a row vector where the i'th
    entry corresponds to the total TF-IDF score of the text fragment for
    concept i.

    text -- the text fragment. It is passed through self.clean and split on
            whitespace; words missing from self.word2index are ignored.

    Returns a 1 x self.n_concepts CSR matrix.'''
    #Remove mess (quotes, parentheses etc) from text
    text = self.clean(text)

    #Convert string to hash like {'word' : no. of occurrences}
    countmap = Counter(text.split())

    #Group the known words by the matrix file holding their row, so every
    #file is loaded once instead of once per word
    rows_by_file = {}
    for word, count in countmap.items():
        #Only the index lookup is guarded - a KeyError raised anywhere else
        #is a real error and must not be mistaken for an unknown word
        try:
            ind = self.word2index[word]
        except KeyError:
            continue    #No data on this word -> discard
        #Which file to look in, and which row of that file to extract
        file_number, row_number = divmod(ind, row_chunk_size)
        rows_by_file.setdefault(file_number, []).append((row_number, count))

    #Interpretation vector to be returned
    result = sps.csr_matrix((1, self.n_concepts), dtype=float)

    #Add each word's row, weighted by its number of occurrences
    for file_number, entries in rows_by_file.items():
        filename = matrix_dir + str(file_number) + extensions['matrix']
        with open(filename, 'r') as f:
            temp = shared.mload(f)
        for row_number, count in entries:
            result = result + count * temp.getrow(row_number)

    #Done. Return row vector as a 1x#concepts CSR matrix
    return result
def interpretation_vector(self, text):
    '''Converts a text fragment string into a row vector where the i'th
    entry corresponds to the total TF-IDF score of the text fragment for
    concept i.

    text -- the text fragment. It is passed through self.clean and split on
            whitespace; words missing from self.word2index are ignored.

    Returns a 1 x self.n_concepts CSR matrix.'''
    #Remove mess (quotes, parentheses etc) from text
    text = self.clean(text)

    #Convert string to hash like {'word' : no. of occurrences}
    countmap = Counter(text.split())

    #Group the known words by the matrix file holding their row, so every
    #file is loaded once instead of once per word
    rows_by_file = {}
    for word, count in countmap.items():
        #Only the index lookup is guarded - a KeyError raised anywhere else
        #is a real error and must not be mistaken for an unknown word
        try:
            ind = self.word2index[word]
        except KeyError:
            continue    #No data on this word -> discard
        #Which file to look in, and which row of that file to extract
        file_number, row_number = divmod(ind, row_chunk_size)
        rows_by_file.setdefault(file_number, []).append((row_number, count))

    #Interpretation vector to be returned
    result = sps.csr_matrix((1, self.n_concepts), dtype=float)

    #Add each word's row, weighted by its number of occurrences
    for file_number, entries in rows_by_file.items():
        filename = matrix_dir + str(file_number) + extensions['matrix']
        with open(filename, 'r') as f:
            temp = shared.mload(f)
        for row_number, count in entries:
            result = result + count * temp.getrow(row_number)

    #Done. Return row vector as a 1x#concepts CSR matrix
    return result
def writeout():
    '''Stores the current matrix on disk as row chunks, one file per chunk.

    Every chunk yielded by matrix_chopper(mtx, row_chunk_size) goes to the
    file matrix_dir + <chunk number> + extensions['matrix']. If that file
    can already be opened, the matrix loaded from it is added to the chunk
    before the file is overwritten.'''
    for chunk_no, chunk in matrix_chopper(mtx, row_chunk_size):
        target = ''.join((matrix_dir, str(chunk_no), extensions['matrix']))
        log("Writing out chunk %s" % chunk_no)

        #Fold in whatever an earlier pass already saved for this chunk
        try:
            with open(target, 'r') as stored:
                chunk = chunk + shared.mload(stored)
        except IOError:
            #Nothing saved for this chunk yet, so there is nothing to add
            pass

        #Replace the file with the (possibly accumulated) chunk
        with open(target, 'w') as sink:
            shared.mdump(chunk, sink)
    return None
def writeout():
    '''Stores the current matrix on disk as row chunks, one file per chunk.

    Every chunk yielded by matrix_chopper(mtx, row_chunk_size) goes to the
    file matrix_dir + <chunk number> + extensions['matrix']. If that file
    can already be opened, the matrix loaded from it is added to the chunk
    before the file is overwritten.'''
    for chunk_no, chunk in matrix_chopper(mtx, row_chunk_size):
        target = ''.join((matrix_dir, str(chunk_no), extensions['matrix']))
        log("Writing out chunk %s" % chunk_no)

        #Fold in whatever an earlier pass already saved for this chunk
        try:
            with open(target, 'r') as stored:
                chunk = chunk + shared.mload(stored)
        except IOError:
            #Nothing saved for this chunk yet, so there is nothing to add
            pass

        #Replace the file with the (possibly accumulated) chunk
        with open(target, 'w') as sink:
            shared.mdump(chunk, sink)
    return None
def _tfidf_normalize(counts, n_concepts):
    '''Converts the non-zero counts of one word into unit-length TF-IDF
    scores.

    counts     -- non-empty 1-D array holding the number of occurrences of
                  the word in each concept that contains it.
    n_concepts -- total number of concepts (matrix columns).

    Returns a new float array of the same length. It is all zeroes when the
    word occurs in every concept, because its IDF is then log(1) = 0.'''
    counts = np.asarray(counts, dtype=float)
    #Force true division - integer division would floor the ratio, giving
    #an IDF of zero to any word found in more than half of the concepts
    idf = np.log(float(n_concepts) / len(counts))
    weights = (1 + np.log(counts)) * idf
    norm = np.linalg.norm(weights)
    #Dividing by a zero norm would fill the row with NaNs
    if norm > 0:
        weights /= norm
    return weights


def _prune_row(row, window_size, cutoff):
    '''Applies the sliding window pruning to one row of scores, in place.

    The scores are ranked in descending order and a window spanning
    window_size ranks is slid down the ranking. At the first rank i where
    the score at rank i + window_size is at least cutoff times the score at
    rank i, the entry at rank i and everything ranked below it is set to
    zero. Rows with at most window_size entries are left untouched.

    Returns the (modified) row.'''
    #Number of documents containing the word
    n_docs = len(row)
    #Don't prune if the window exceeds the array bounds
    if window_size >= n_docs:
        return row
    #Indices such that row[order] is sorted in descending order
    order = np.argsort(row)[::-1]
    ranked = row[order]
    #Ranks at which the pruning condition is met
    hits = np.flatnonzero(
        ranked[window_size:] >= cutoff * ranked[:n_docs - window_size])
    if hits.size:
        #Truncate, i.e. set the remaining entries to zero
        ranked[hits[0]:] = 0
    #Unsort to original positions
    row[order] = ranked
    return row


def main():
    '''Builds the word/concept count matrix as row chunks on disk, then
    converts every chunk to normalized (and optionally pruned) TF-IDF scores.

    Reads the word and concept index files from matrix_dir and the content
    files from temp_dir. Deletes, writes and rewrites the matrix files in
    matrix_dir, and closes the log file when finished. Returns None.'''
    #Cleanup
    for f in glob.glob(matrix_dir + '/*' + extensions['matrix']):
        os.remove(f)

    #Set pruning parameters
    window_size = shared.window_size
    cutoff = shared.cutoff

    #Read in dicts mapping words and concepts to their respective indices
    log("Reading in word/index data")
    with open(matrix_dir + 'word2index.ind', 'r') as f:
        word2index = shared.load(f)
    with open(matrix_dir + 'concept2index.ind', 'r') as f:
        concept2index = shared.load(f)
    log("...Done!")

    #==========================================================================
    # Construct count matrix in small chunks
    #==========================================================================

    #Count words and concepts
    n_words = len(word2index)
    n_concepts = len(concept2index)

    #Determine matrix dimensions
    matrix_shape = (n_words, n_concepts)

    #Allocate sparse matrix. Dict-of-keys should be faster for iterative
    #construction. Convert to csr for fast row operations later.
    mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    def matrix_chopper(matrix, dim):
        '''Generator to split a huge matrix into small submatrices, which
        can then be stored in individual files.

        This is handy both when constructing the matrix (building the whole
        matrix without saving to files in the process takes about 50 gigs
        RAM), and when applying it, as this allows one to load only the
        submatrix relevant to a given word.

        Yields (chunk number, CSR submatrix of at most dim rows) pairs.'''
        csr = matrix.tocsr()
        rows = csr.get_shape()[0]
        for counter, ind in enumerate(range(0, rows, dim)):
            #Row slicing extracts the whole chunk in a single operation
            yield counter, csr[ind:min(ind + dim, rows)]

    def writeout():
        '''Saves the matrix as small submatrices in separate files.'''
        for n, submatrix in matrix_chopper(mtx, row_chunk_size):
            filename = matrix_dir + str(n) + extensions['matrix']
            log("Writing out chunk %s" % n)
            #Update submatrix if it's already partially calculated
            try:
                with open(filename, 'r') as f:
                    submatrix = submatrix + shared.mload(f)
            except IOError:
                pass    #File doesn't exist yet, so no need to change mtx
            #Dump the submatrix to file
            with open(filename, 'w') as f:
                shared.mdump(submatrix, f)
        return None

    log("Constructing matrix.")
    filelist = glob.glob(temp_dir + '*' + extensions['content'])
    files_read = 0
    for filename in filelist:
        with open(filename, 'r') as f:
            content = shared.load(f)

        #Loop over concepts (columns)
        for concept, entry in content.items():
            #This is the column index (concept w. index j)
            j = concept2index[concept]

            #Convert concept 'countmap' like so: {word : n}
            wordmap = Counter(entry['text'].split())

            #Add the number of times word i occurs in concept j
            for word, count in wordmap.items():
                mtx[word2index[word], j] = count

        #Update file count
        files_read += 1
        log("Processed content file no. %s of %s - %s"
            % (files_read, len(filelist),
               percentof(files_read, len(filelist))))

        #Flush to disk at regular intervals to keep memory usage down
        if files_read % column_chunk_size == 0:
            mtx = mtx.tocsr()
            writeout()
            #Start over with an empty matrix of the same type as before
            mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    #Convert matrix to CSR format and write to files.
    mtx = mtx.tocsr()
    writeout()

    #==========================================================================
    # Count matrix/matrices constructed - computing TF-IDF
    #==========================================================================
    log("Done - computing TF-IDF")

    #Grab list of matrix files (containing the submatrices from before)
    matrixfiles = glob.glob(matrix_dir + "*" + extensions['matrix'])

    words_processed = 0    #for logging purposes
    for filename in matrixfiles:
        with open(filename, 'r') as f:
            mtx = shared.mload(f)

        #The scores are fractional - writing them into integer storage
        #would round them to zero without warning
        if mtx.dtype.kind != 'f':
            mtx = mtx.astype(float)

        #Each (start, stop) pair delimits the non-zero elements of one row
        for start, stop in zip(mtx.indptr[:-1], mtx.indptr[1:]):
            if start == stop:
                continue    #Word occurs nowhere - nothing to convert

            #Map all elements to TF-IDF and normalize the row
            row = _tfidf_normalize(mtx.data[start:stop], n_concepts)

            #Inverted index pruning
            if prune:
                _prune_row(row, window_size, cutoff)

            #Update matrix
            mtx.data[start:stop] = row

            #Log it
            words_processed += 1
            if words_processed % 10**3 == 0:
                log("Processing word %s of %s - %s"
                    % (words_processed, n_words,
                       percentof(words_processed, n_words)))

        #Keep it sparse - no need to store zeroes
        mtx.eliminate_zeros()
        with open(filename, 'w') as f:
            shared.mdump(mtx, f)

    log("Done!")

    #Notify that the job is done
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except Exception:
            #The notification is best effort - the job itself succeeded
            log("Job's done. Push failed.")

    logfile.close()
    return None
def _tfidf_normalize(counts, n_concepts):
    '''Converts the non-zero counts of one word into unit-length TF-IDF
    scores.

    counts     -- non-empty 1-D array holding the number of occurrences of
                  the word in each concept that contains it.
    n_concepts -- total number of concepts (matrix columns).

    Returns a new float array of the same length. It is all zeroes when the
    word occurs in every concept, because its IDF is then log(1) = 0.'''
    counts = np.asarray(counts, dtype=float)
    #Force true division - integer division would floor the ratio, giving
    #an IDF of zero to any word found in more than half of the concepts
    idf = np.log(float(n_concepts) / len(counts))
    weights = (1 + np.log(counts)) * idf
    norm = np.linalg.norm(weights)
    #Dividing by a zero norm would fill the row with NaNs
    if norm > 0:
        weights /= norm
    return weights


def _prune_row(row, window_size, cutoff):
    '''Applies the sliding window pruning to one row of scores, in place.

    The scores are ranked in descending order and a window spanning
    window_size ranks is slid down the ranking. At the first rank i where
    the score at rank i + window_size is at least cutoff times the score at
    rank i, the entry at rank i and everything ranked below it is set to
    zero. Rows with at most window_size entries are left untouched.

    Returns the (modified) row.'''
    #Number of documents containing the word
    n_docs = len(row)
    #Don't prune if the window exceeds the array bounds
    if window_size >= n_docs:
        return row
    #Indices such that row[order] is sorted in descending order
    order = np.argsort(row)[::-1]
    ranked = row[order]
    #Ranks at which the pruning condition is met
    hits = np.flatnonzero(
        ranked[window_size:] >= cutoff * ranked[:n_docs - window_size])
    if hits.size:
        #Truncate, i.e. set the remaining entries to zero
        ranked[hits[0]:] = 0
    #Unsort to original positions
    row[order] = ranked
    return row


def main():
    '''Builds the word/concept count matrix as row chunks on disk, then
    converts every chunk to normalized (and optionally pruned) TF-IDF scores.

    Reads the word and concept index files from matrix_dir and the content
    files from temp_dir. Deletes, writes and rewrites the matrix files in
    matrix_dir, and closes the log file when finished. Returns None.'''
    #Cleanup
    for f in glob.glob(matrix_dir + '/*' + extensions['matrix']):
        os.remove(f)

    #Set pruning parameters
    window_size = shared.window_size
    cutoff = shared.cutoff

    #Read in dicts mapping words and concepts to their respective indices
    log("Reading in word/index data")
    with open(matrix_dir + 'word2index.ind', 'r') as f:
        word2index = shared.load(f)
    with open(matrix_dir + 'concept2index.ind', 'r') as f:
        concept2index = shared.load(f)
    log("...Done!")

    #==========================================================================
    # Construct count matrix in small chunks
    #==========================================================================

    #Count words and concepts
    n_words = len(word2index)
    n_concepts = len(concept2index)

    #Determine matrix dimensions
    matrix_shape = (n_words, n_concepts)

    #Allocate sparse matrix. Dict-of-keys should be faster for iterative
    #construction. Convert to csr for fast row operations later.
    mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    def matrix_chopper(matrix, dim):
        '''Generator to split a huge matrix into small submatrices, which
        can then be stored in individual files.

        This is handy both when constructing the matrix (building the whole
        matrix without saving to files in the process takes about 50 gigs
        RAM), and when applying it, as this allows one to load only the
        submatrix relevant to a given word.

        Yields (chunk number, CSR submatrix of at most dim rows) pairs.'''
        csr = matrix.tocsr()
        rows = csr.get_shape()[0]
        for counter, ind in enumerate(range(0, rows, dim)):
            #Row slicing extracts the whole chunk in a single operation
            yield counter, csr[ind:min(ind + dim, rows)]

    def writeout():
        '''Saves the matrix as small submatrices in separate files.'''
        for n, submatrix in matrix_chopper(mtx, row_chunk_size):
            filename = matrix_dir + str(n) + extensions['matrix']
            log("Writing out chunk %s" % n)
            #Update submatrix if it's already partially calculated
            try:
                with open(filename, 'r') as f:
                    submatrix = submatrix + shared.mload(f)
            except IOError:
                pass    #File doesn't exist yet, so no need to change mtx
            #Dump the submatrix to file
            with open(filename, 'w') as f:
                shared.mdump(submatrix, f)
        return None

    log("Constructing matrix.")
    filelist = glob.glob(temp_dir + '*' + extensions['content'])
    files_read = 0
    for filename in filelist:
        with open(filename, 'r') as f:
            content = shared.load(f)

        #Loop over concepts (columns)
        for concept, entry in content.items():
            #This is the column index (concept w. index j)
            j = concept2index[concept]

            #Convert concept 'countmap' like so: {word : n}
            wordmap = Counter(entry['text'].split())

            #Add the number of times word i occurs in concept j
            for word, count in wordmap.items():
                mtx[word2index[word], j] = count

        #Update file count
        files_read += 1
        log("Processed content file no. %s of %s - %s"
            % (files_read, len(filelist),
               percentof(files_read, len(filelist))))

        #Flush to disk at regular intervals to keep memory usage down
        if files_read % column_chunk_size == 0:
            mtx = mtx.tocsr()
            writeout()
            #Start over with an empty matrix of the same type as before
            mtx = sps.dok_matrix(matrix_shape, dtype=datatype)

    #Convert matrix to CSR format and write to files.
    mtx = mtx.tocsr()
    writeout()

    #==========================================================================
    # Count matrix/matrices constructed - computing TF-IDF
    #==========================================================================
    log("Done - computing TF-IDF")

    #Grab list of matrix files (containing the submatrices from before)
    matrixfiles = glob.glob(matrix_dir + "*" + extensions['matrix'])

    words_processed = 0    #for logging purposes
    for filename in matrixfiles:
        with open(filename, 'r') as f:
            mtx = shared.mload(f)

        #The scores are fractional - writing them into integer storage
        #would round them to zero without warning
        if mtx.dtype.kind != 'f':
            mtx = mtx.astype(float)

        #Each (start, stop) pair delimits the non-zero elements of one row
        for start, stop in zip(mtx.indptr[:-1], mtx.indptr[1:]):
            if start == stop:
                continue    #Word occurs nowhere - nothing to convert

            #Map all elements to TF-IDF and normalize the row
            row = _tfidf_normalize(mtx.data[start:stop], n_concepts)

            #Inverted index pruning
            if prune:
                _prune_row(row, window_size, cutoff)

            #Update matrix
            mtx.data[start:stop] = row

            #Log it
            words_processed += 1
            if words_processed % 10**3 == 0:
                log("Processing word %s of %s - %s"
                    % (words_processed, n_words,
                       percentof(words_processed, n_words)))

        #Keep it sparse - no need to store zeroes
        mtx.eliminate_zeros()
        with open(filename, 'w') as f:
            shared.mdump(mtx, f)

    log("Done!")

    #Notify that the job is done
    if shared.notify:
        try:
            shared.pushme(sys.argv[0] + ' completed.')
        except Exception:
            #The notification is best effort - the job itself succeeded
            log("Job's done. Push failed.")

    logfile.close()
    return None