def countRandomBitFrequencies(numTerms=100000, percentSparsity=0.01): """Create a uniformly random counts matrix through sampling.""" # Accumulate counts by inplace-adding sparse matrices counts = SparseMatrix() size = 128 * 128 counts.resize(1, size) # Pre-allocate buffer sparse matrix sparseBitmap = SparseMatrix() sparseBitmap.resize(1, size) random.seed(42) # Accumulate counts for each bit for each word numWords = 0 for term in xrange(numTerms): bitmap = random.sample(xrange(size), int(size * percentSparsity)) bitmap.sort() sparseBitmap.setRowFromSparse(0, bitmap, [1] * len(bitmap)) counts += sparseBitmap numWords += 1 # Compute normalized version of counts as a separate matrix frequencies = SparseMatrix() frequencies.resize(1, size) frequencies.copy(counts) frequencies.divide(float(numWords)) # Wrap up by printing some statistics and then saving the normalized version printFrequencyStatistics(counts, frequencies, numWords, size) frequencyFilename = "bit_frequencies_random.pkl" print "Saving frequency matrix in", frequencyFilename with open(frequencyFilename, "wb") as frequencyPickleFile: pickle.dump(frequencies, frequencyPickleFile) return counts
def countRandomBitFrequencies(numTerms = 100000, percentSparsity = 0.01): """Create a uniformly random counts matrix through sampling.""" # Accumulate counts by inplace-adding sparse matrices counts = SparseMatrix() size = 128*128 counts.resize(1, size) # Pre-allocate buffer sparse matrix sparseBitmap = SparseMatrix() sparseBitmap.resize(1, size) random.seed(42) # Accumulate counts for each bit for each word numWords=0 for term in xrange(numTerms): bitmap = random.sample(xrange(size), int(size*percentSparsity)) bitmap.sort() sparseBitmap.setRowFromSparse(0, bitmap, [1]*len(bitmap)) counts += sparseBitmap numWords += 1 # Compute normalized version of counts as a separate matrix frequencies = SparseMatrix() frequencies.resize(1, size) frequencies.copy(counts) frequencies.divide(float(numWords)) # Wrap up by printing some statistics and then saving the normalized version printFrequencyStatistics(counts, frequencies, numWords, size) frequencyFilename = "bit_frequencies_random.pkl" print "Saving frequency matrix in",frequencyFilename with open(frequencyFilename, "wb") as frequencyPickleFile: pickle.dump(frequencies, frequencyPickleFile) return counts
def countBitFrequenciesForTerms(client, lines, acceptanceProbability=0.1, usePlaceholderEncoding=True, percentSparsity=0.0102): # Accumulate counts by inplace-adding sparse matrices skippedWords = {} counts = SparseMatrix() width = RETINA_SIZES[client.retina]["width"] height = RETINA_SIZES[client.retina]["height"] counts.resize(1, width * height) # Pre-allocate buffer sparse matrix sparseBitmap = SparseMatrix() sparseBitmap.resize(1, width * height) # Accumulate counts for each bit for each word numWords = 0 numLines = 0 for line in lines: tokens = TextPreprocess().tokenize(line) for term in tokens: p = random.uniform(0, 1) if p <= acceptanceProbability: if usePlaceholderEncoding: random.seed(term) bitmap = random.sample( xrange(width * height), int(width * height * percentSparsity)) bitmap.sort() random.seed(p) else: try: bitmap = client.getBitmap( term)["fingerprint"]["positions"] except Exception as err: print "Skipping '{}', reason: {}".format( term, str(err)) continue if not bitmap: skippedWords[term] = skippedWords.get(term, 0) + 1 # print "Skipping '{}', reason: empty".format(term) continue sparseBitmap.setRowFromSparse(0, bitmap, [1] * len(bitmap)) counts += sparseBitmap numWords += 1 numLines += 1 if numLines % 1000 == 0: print "...processed=", numLines, "lines and", numWords, "words" # Compute normalized version of counts as a separate matrix frequencies = SparseMatrix() frequencies.resize(1, width * height) frequencies.copy(counts) frequencies.divide(float(numWords)) # Wrap up by printing some statistics and then saving the normalized version print "Processed", numLines, "lines" printFrequencyStatistics(counts, frequencies, numWords, width * height) frequencyFilename = "bit_frequencies_" + client.retina + ".pkl" print "Saving frequency matrix in", frequencyFilename with open(frequencyFilename, "wb") as frequencyPickleFile: pickle.dump(frequencies, frequencyPickleFile) print "These words were skipped N times because of empty bitmap result" print skippedWords return counts
def countBitFrequenciesForTerms(client, lines, acceptanceProbability = 0.1, usePlaceholderEncoding = True, percentSparsity = 0.0102): # Accumulate counts by inplace-adding sparse matrices skippedWords = {} counts = SparseMatrix() width = RETINA_SIZES[client.retina]["width"] height = RETINA_SIZES[client.retina]["height"] counts.resize(1, width*height) # Pre-allocate buffer sparse matrix sparseBitmap = SparseMatrix() sparseBitmap.resize(1, width*height) # Accumulate counts for each bit for each word numWords=0 numLines=0 for line in lines: tokens = TextPreprocess().tokenize(line) for term in tokens: p = random.uniform(0,1) if p <= acceptanceProbability: if usePlaceholderEncoding: random.seed(term) bitmap = random.sample(xrange(width*height), int(width*height*percentSparsity)) bitmap.sort() random.seed(p) else: try: bitmap = client.getBitmap(term)["fingerprint"]["positions"] except Exception as err: print "Skipping '{}', reason: {}".format(term, str(err)) continue if not bitmap: skippedWords[term] = skippedWords.get(term,0)+1 # print "Skipping '{}', reason: empty".format(term) continue sparseBitmap.setRowFromSparse(0, bitmap, [1]*len(bitmap)) counts += sparseBitmap numWords += 1 numLines += 1 if numLines%1000==0: print "...processed=",numLines,"lines and",numWords,"words" # Compute normalized version of counts as a separate matrix frequencies = SparseMatrix() frequencies.resize(1, width*height) frequencies.copy(counts) frequencies.divide(float(numWords)) # Wrap up by printing some statistics and then saving the normalized version print "Processed",numLines,"lines" printFrequencyStatistics(counts, frequencies, numWords, width*height) frequencyFilename = "bit_frequencies_"+client.retina+".pkl" print "Saving frequency matrix in",frequencyFilename with open(frequencyFilename, "wb") as frequencyPickleFile: pickle.dump(frequencies, frequencyPickleFile) print "These words were skipped N times because of empty bitmap result" print skippedWords return counts