Example #1
0
def main(fname, num_keys):
    """Generate random entries and write them to an MTBL file.

    Each entry has a zero-padded 20-digit decimal key and a value made of a
    single random lowercase letter repeated 1 to 50 times.  Entries are fed
    into an ``mtbl.sorter`` built with the module-level ``merge_func``
    (presumably used to combine values of duplicate keys -- confirm against
    the mtbl bindings) and finally written through a Snappy-compressed
    ``mtbl.writer``.  A progress line goes to stderr every
    ``report_interval`` entries, followed by a final summary line.

    Args:
        fname: Path of the MTBL output file.
        num_keys: Number of random entries to generate.
    """

    def grouped(fmt, value):
        # locale.format() was deprecated in Python 3.7 and removed in 3.12;
        # locale.format_string() is available in 2.7 and 3.x alike and is
        # equivalent for a single format specifier.
        return locale.format_string(fmt, value, grouping=True)

    def per_second(entries, seconds):
        # A coarse clock can report zero elapsed time; report a rate of 0
        # rather than crashing with ZeroDivisionError.
        return entries / seconds if seconds > 0 else 0

    sorter = mtbl.sorter(merge_func)
    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    started = time.time()
    last = started
    total_bytes = 0
    count = 0
    while count < num_keys:
        count += 1
        # sys.maxsize exists on both Python 2.6+ and 3.x (sys.maxint does
        # not exist on Python 3); its value still fits the 20-digit key.
        key = '%020d' % random.randint(0, sys.maxsize)
        val = random.choice(string.ascii_lowercase) * random.randint(1, 50)
        sorter[key] = val
        total_bytes += len(key) + len(val)
        if (count % report_interval) == 0:
            now = time.time()
            last_secs = now - last
            last = now
            sys.stderr.write(
                'generated %s entries (%s MB) in %s seconds, %s entries/second\n'
                % (grouped('%d', count),
                   grouped('%d', total_bytes / megabyte),
                   grouped('%f', last_secs),
                   grouped('%d', per_second(report_interval, last_secs))))

    sys.stderr.write('writing to output file %s\n' % fname)
    sorter.write(writer)
    total_secs = time.time() - started
    sys.stderr.write(
        'wrote %s total entries (%s MB) in %s seconds, %s entries/second\n' %
        (grouped('%d', count),
         grouped('%d', total_bytes / megabyte),
         grouped('%f', total_secs),
         grouped('%d', per_second(count, total_secs))))
Example #2
0
def main(txt_fname, mtbl_fname):
    """Index the words of a Project Gutenberg text into an MTBL file.

    The Gutenberg header (everything up to and including the START marker
    line, plus the five lines after it) is skipped.  Every following line,
    up to the first end-of-ebook marker, is split on whitespace; each token
    is stripped of surrounding punctuation, lower-cased and stored in an
    ``mtbl.sorter`` with the value ``mtbl.varint_encode(1)``.  The sorter is
    built with the module-level ``merge_func`` (presumably it combines the
    values of repeated words -- confirm against its definition).  The sorted
    entries are then written through a Snappy-compressed ``mtbl.writer``.

    Note: a token made only of punctuation yields an empty word, which is
    still inserted, exactly as before.

    Args:
        txt_fname: Path of the Project Gutenberg plain-text input file.
        mtbl_fname: Path of the MTBL output file.

    Raises:
        ValueError: If the START marker line is never found in the input.
    """
    start_marker = '*** START OF THIS PROJECT GUTENBERG EBOOK'
    end_markers = (
        'End of the Project Gutenberg EBook',
        '*** END OF THIS PROJECT GUTENBERG EBOOK',
    )

    # The context manager closes the input even if processing fails.
    with open(txt_fname) as txt:
        # trim header
        while True:
            line = txt.readline()
            if not line:
                # readline() returns '' at EOF; without this check a file
                # lacking the marker would make the loop spin forever.
                raise ValueError(
                    'start marker %r not found in %s'
                    % (start_marker, txt_fname))
            if line.startswith(start_marker):
                break
        for _ in range(5):
            txt.readline()

        # Create the output only once the input has been validated, so an
        # unusable input does not leave a stray output file behind.
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        for line in txt:
            if line.startswith(end_markers):
                break
            for tok in line.strip().split():
                word = tok.strip(string.punctuation).lower()
                sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
Example #3
0
def main(txt_fname, mtbl_fname):
    """Index the words of a Project Gutenberg text into an MTBL file.

    The Gutenberg header (everything up to and including the START marker
    line, plus the five lines after it) is skipped.  Every following line,
    up to the first end-of-ebook marker, is split on whitespace; each token
    is stripped of surrounding punctuation, lower-cased and stored in an
    ``mtbl.sorter`` with the value ``mtbl.varint_encode(1)``.  The sorter is
    built with the module-level ``merge_func`` (presumably it combines the
    values of repeated words -- confirm against its definition).  The sorted
    entries are then written through a Snappy-compressed ``mtbl.writer``.

    Note: a token made only of punctuation yields an empty word, which is
    still inserted, exactly as before.

    Args:
        txt_fname: Path of the Project Gutenberg plain-text input file.
        mtbl_fname: Path of the MTBL output file.

    Raises:
        ValueError: If the START marker line is never found in the input.
    """
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_markers = (
        "End of the Project Gutenberg EBook",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
    )

    # The context manager closes the input even if processing fails.
    with open(txt_fname) as txt:
        # trim header
        while True:
            line = txt.readline()
            if not line:
                # readline() returns "" at EOF; without this check a file
                # lacking the marker would make the loop spin forever.
                raise ValueError(
                    "start marker %r not found in %s" % (start_marker, txt_fname)
                )
            if line.startswith(start_marker):
                break
        for _ in range(5):
            txt.readline()

        # Create the output only once the input has been validated, so an
        # unusable input does not leave a stray output file behind.
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        for line in txt:
            if line.startswith(end_markers):
                break
            for tok in line.strip().split():
                word = tok.strip(string.punctuation).lower()
                sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
def main(fname, num_keys):
    """Generate random entries and write them to an MTBL file.

    Each entry has a zero-padded 20-digit decimal key and a value made of a
    single random lowercase letter repeated 1 to 50 times.  Entries are fed
    into an ``mtbl.sorter`` built with the module-level ``merge_func``
    (presumably used to combine values of duplicate keys -- confirm against
    the mtbl bindings) and finally written through a Snappy-compressed
    ``mtbl.writer``.  A progress line goes to stderr every
    ``report_interval`` entries, followed by a final summary line.

    Args:
        fname: Path of the MTBL output file.
        num_keys: Number of random entries to generate.
    """

    def grouped(fmt, value):
        # locale.format() was deprecated in Python 3.7 and removed in 3.12;
        # locale.format_string() is available in 2.7 and 3.x alike and is
        # equivalent for a single format specifier.
        return locale.format_string(fmt, value, grouping=True)

    def per_second(entries, seconds):
        # A coarse clock can report zero elapsed time; report a rate of 0
        # rather than crashing with ZeroDivisionError.
        return entries / seconds if seconds > 0 else 0

    sorter = mtbl.sorter(merge_func)
    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    started = time.time()
    last = started
    total_bytes = 0
    count = 0
    while count < num_keys:
        count += 1
        # sys.maxsize exists on both Python 2.6+ and 3.x (sys.maxint does
        # not exist on Python 3); its value still fits the 20-digit key.
        key = '%020d' % random.randint(0, sys.maxsize)
        val = random.choice(string.ascii_lowercase) * random.randint(1, 50)
        sorter[key] = val
        total_bytes += len(key) + len(val)
        if (count % report_interval) == 0:
            now = time.time()
            last_secs = now - last
            last = now
            sys.stderr.write('generated %s entries (%s MB) in %s seconds, %s entries/second\n' % (
                grouped('%d', count),
                grouped('%d', total_bytes / megabyte),
                grouped('%f', last_secs),
                grouped('%d', per_second(report_interval, last_secs)),
            ))

    sys.stderr.write('writing to output file %s\n' % fname)
    sorter.write(writer)
    total_secs = time.time() - started
    sys.stderr.write('wrote %s total entries (%s MB) in %s seconds, %s entries/second\n' % (
        grouped('%d', count),
        grouped('%d', total_bytes / megabyte),
        grouped('%f', total_secs),
        grouped('%d', per_second(count, total_secs)),
    ))