def main(fname, num_keys):
    """Generate random entries, sort them, and write them to an MTBL file.

    Each key is a 20-digit, zero-padded random non-negative integer; each
    value is one random lowercase ASCII letter repeated 1 to 50 times.
    Entries are fed through an ``mtbl.sorter`` and then dumped into a
    Snappy-compressed ``mtbl.writer``. Progress is reported on stderr every
    ``report_interval`` entries, followed by a final summary.

    Args:
        fname: Path of the MTBL file to create.
        num_keys: Number of entries to generate.
    """
    sorter = mtbl.sorter(merge_func)
    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    def fmt(spec, value):
        # locale.format() was removed in Python 3.12; format_string() takes
        # the same arguments here and is also available on Python 2.
        return locale.format_string(spec, value, grouping=True)

    def rate(entries, seconds):
        # A coarse clock can report zero elapsed time; report 0 instead of
        # raising ZeroDivisionError in the middle of a run.
        return entries / seconds if seconds > 0 else 0

    started = time.time()
    last = started
    total_bytes = 0
    count = 0

    while count < num_keys:
        count += 1
        # sys.maxsize exists on both Python 2.6+ and 3 (sys.maxint is
        # Python 2 only); its largest value still fits in 20 digits.
        key = '%020d' % random.randint(0, sys.maxsize)
        val = random.choice(string.ascii_lowercase) * random.randint(1, 50)
        sorter[key] = val
        total_bytes += len(key) + len(val)

        if (count % report_interval) == 0:
            now = time.time()
            last_secs = now - last
            last = now
            sys.stderr.write(
                'generated %s entries (%s MB) in %s seconds, %s entries/second\n'
                % (
                    fmt('%d', count),
                    fmt('%d', total_bytes / megabyte),
                    fmt('%f', last_secs),
                    fmt('%d', rate(report_interval, last_secs)),
                )
            )

    sys.stderr.write('writing to output file %s\n' % fname)
    sorter.write(writer)

    total_secs = time.time() - started
    sys.stderr.write(
        'wrote %s total entries (%s MB) in %s seconds, %s entries/second\n'
        % (
            fmt('%d', count),
            fmt('%d', total_bytes / megabyte),
            fmt('%f', total_secs),
            fmt('%d', rate(count, total_secs)),
        )
    )
def main(txt_fname, mtbl_fname):
    """Index the words of a Project Gutenberg text file into an MTBL file.

    Everything up to and including the ``*** START OF THIS PROJECT GUTENBERG
    EBOOK`` line, plus the five lines after it, is skipped. Each following
    line is split on whitespace until an end-of-book marker is reached; every
    token is stripped of surrounding punctuation, lower-cased, and stored in
    an ``mtbl.sorter`` with the value ``mtbl.varint_encode(1)``. The sorter is
    then written to a Snappy-compressed MTBL file.

    Args:
        txt_fname: Path of the Project Gutenberg plain-text input file.
        mtbl_fname: Path of the MTBL file to create.

    Raises:
        ValueError: If the start marker never appears in the input file.
    """
    start_marker = '*** START OF THIS PROJECT GUTENBERG EBOOK'
    end_markers = (
        'End of the Project Gutenberg EBook',
        '*** END OF THIS PROJECT GUTENBERG EBOOK',
    )

    # The context manager closes the input even if parsing fails part-way.
    with open(txt_fname) as txt:
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        # trim header
        while True:
            line = txt.readline()
            if not line:
                # readline() returns '' at EOF; without this check a file
                # lacking the marker would loop forever.
                raise ValueError(
                    'start marker not found in %s' % txt_fname)
            if line.startswith(start_marker):
                break
        for _ in range(5):
            txt.readline()

        for line in txt:
            if line.startswith(end_markers):
                break
            for tok in line.strip().split():
                word = tok.strip(string.punctuation).lower()
                sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
def main(txt_fname, mtbl_fname):
    """Build an MTBL file of the words in a Project Gutenberg text file.

    The header is discarded: every line through the one starting with
    ``*** START OF THIS PROJECT GUTENBERG EBOOK``, and the five lines that
    follow it. The remaining lines are tokenized on whitespace until a line
    starting with an end-of-book marker is seen. Each token, with surrounding
    punctuation stripped and lower-cased, becomes a sorter key whose value is
    ``mtbl.varint_encode(1)``. Finally the sorter is flushed to a
    Snappy-compressed MTBL writer.

    Args:
        txt_fname: Path of the Project Gutenberg plain-text input file.
        mtbl_fname: Path of the MTBL file to create.

    Raises:
        ValueError: If the input ends before the start marker is found.
    """
    header_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    footer_markers = (
        "End of the Project Gutenberg EBook",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
    )

    # Open the input first, as before, and guarantee it is closed on exit.
    with open(txt_fname) as txt:
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        # trim header
        line = txt.readline()
        while not line.startswith(header_marker):
            if not line:
                # An empty string means EOF: stop instead of spinning forever
                # on input that has no start marker.
                raise ValueError(
                    "start marker not found in %s" % txt_fname
                )
            line = txt.readline()
        for _ in range(5):
            txt.readline()

        for line in txt:
            if line.startswith(footer_markers):
                break
            for tok in line.strip().split():
                word = tok.strip(string.punctuation).lower()
                sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
def main(fname, num_keys):
    """Write ``num_keys`` randomly generated entries to an MTBL file.

    Keys are random non-negative integers rendered as 20-digit zero-padded
    decimal strings; values are a single random lowercase ASCII letter
    repeated between 1 and 50 times. Entries pass through an ``mtbl.sorter``
    before being written to a Snappy-compressed ``mtbl.writer``. A progress
    line goes to stderr every ``report_interval`` entries, and a summary line
    is printed once the output has been written.

    Args:
        fname: Path of the MTBL file to create.
        num_keys: Number of entries to generate.
    """
    sorter = mtbl.sorter(merge_func)
    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    def grouped(spec, value):
        # locale.format_string() replaces locale.format(), which no longer
        # exists as of Python 3.12; it is available on Python 2 as well.
        return locale.format_string(spec, value, grouping=True)

    def per_second(entries, seconds):
        # Guard against a zero interval from a low-resolution clock, which
        # would otherwise raise ZeroDivisionError.
        if seconds > 0:
            return entries / seconds
        return 0

    a = time.time()
    last = a
    total_bytes = 0
    count = 0
    while count < num_keys:
        count += 1
        # sys.maxint is Python 2 only; sys.maxsize works on 2.6+ and 3 and
        # never exceeds 20 decimal digits.
        key = '%020d' % random.randint(0, sys.maxsize)
        val = random.choice(string.ascii_lowercase) * random.randint(1, 50)
        sorter[key] = val
        total_bytes += len(key) + len(val)
        if (count % report_interval) == 0:
            b = time.time()
            last_secs = b - last
            last = b
            sys.stderr.write('generated %s entries (%s MB) in %s seconds, %s entries/second\n' % (
                grouped('%d', count),
                grouped('%d', total_bytes / megabyte),
                grouped('%f', last_secs),
                grouped('%d', per_second(report_interval, last_secs))
                )
            )

    sys.stderr.write('writing to output file %s\n' % fname)
    sorter.write(writer)
    b = time.time()
    total_secs = b - a
    sys.stderr.write('wrote %s total entries (%s MB) in %s seconds, %s entries/second\n' % (
        grouped('%d', count),
        grouped('%d', total_bytes / megabyte),
        grouped('%f', total_secs),
        grouped('%d', per_second(count, total_secs))
        )
    )