コード例 #1
0
ファイル: pymtbl_wf_merge.py プロジェクト: edmonds/pymtbl
def main(input_fnames, output_fname):
    """Merge several MTBL files into one Snappy-compressed MTBL file.

    Args:
        input_fnames: iterable of paths, each opened with ``mtbl.reader``.
        output_fname: path handed to ``mtbl.writer`` for the merged output.

    ``merge_func`` is passed to the merger (presumably to combine the
    values of keys that occur in more than one input -- confirm against
    its definition).
    """
    combined = mtbl.merger(merge_func)
    sink = mtbl.writer(output_fname, compression=mtbl.COMPRESSION_SNAPPY)
    for path in input_fnames:
        source = mtbl.reader(path)
        combined.add_reader(source)
    # Stream the merged contents of every registered reader into the output.
    combined.write(sink)
コード例 #2
0
def main(fname, num_keys):
    """Fill a sorter with random entries and write them to an MTBL file.

    Each entry has a zero-padded (width 20) random integer key and a value
    made of one random lowercase letter repeated 1 to 50 times.  A progress
    line is written to stderr every ``report_interval`` entries and a
    summary line once the sorter has been written out.

    Args:
        fname: path of the Snappy-compressed MTBL file to create.
        num_keys: number of entries to generate.
    """
    def fmt(spec, value):
        # locale.format() was removed in Python 3.12; format_string() takes
        # the same arguments and is available on Python 2 as well.
        return locale.format_string(spec, value, grouping=True)

    def rate(entries, secs):
        # A coarse clock can report a zero-length interval; report a rate
        # of 0 instead of raising ZeroDivisionError.
        return entries / secs if secs > 0 else 0

    sorter = mtbl.sorter(merge_func)
    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    a = time.time()
    last = a
    total_bytes = 0
    count = 0
    while count < num_keys:
        count += 1
        # sys.maxsize exists on both Python 2.6+ and Python 3 (sys.maxint
        # is Python 2 only).
        key = '%020d' % random.randint(0, sys.maxsize)
        val = random.choice(string.ascii_lowercase) * random.randint(1, 50)
        sorter[key] = val
        total_bytes += len(key) + len(val)
        if (count % report_interval) == 0:
            b = time.time()
            last_secs = b - last
            last = b
            sys.stderr.write(
                'generated %s entries (%s MB) in %s seconds, %s entries/second\n'
                % (fmt('%d', count),
                   fmt('%d', total_bytes // megabyte),
                   fmt('%f', last_secs),
                   fmt('%d', rate(report_interval, last_secs))))
    sys.stderr.write('writing to output file %s\n' % fname)
    sorter.write(writer)
    b = time.time()
    total_secs = b - a
    sys.stderr.write(
        'wrote %s total entries (%s MB) in %s seconds, %s entries/second\n' %
        (fmt('%d', count),
         fmt('%d', total_bytes // megabyte),
         fmt('%f', total_secs),
         fmt('%d', rate(count, total_secs))))
コード例 #3
0
ファイル: pymtbl_wf_merge.py プロジェクト: rep/pymtbl
def main(input_fnames, output_fname):
    """Combine the given MTBL inputs into a single compressed MTBL output.

    Args:
        input_fnames: iterable of input paths; one ``mtbl.reader`` is
            opened per path and registered with the merger.
        output_fname: destination path, written with Snappy compression.

    The merger is constructed with ``merge_func`` (its exact role is
    defined by the mtbl library -- presumably resolving duplicate keys).
    """
    table_merger = mtbl.merger(merge_func)
    out_table = mtbl.writer(
        output_fname,
        compression=mtbl.COMPRESSION_SNAPPY,
    )
    for in_path in input_fnames:
        in_table = mtbl.reader(in_path)
        table_merger.add_reader(in_table)
    # Let the merger push its merged stream straight into the writer.
    table_merger.write(out_table)
コード例 #4
0
ファイル: pymtbl_merge_tables.py プロジェクト: edmonds/pymtbl
def main(output_fname, input_fnames):
    """Merge MTBL input files entry by entry into a new MTBL file.

    Args:
        output_fname: path of the Snappy-compressed MTBL file to create.
        input_fnames: iterable of paths, each opened with ``mtbl.reader``.

    The merged entries are iterated in Python and copied one at a time
    into the writer, which is closed explicitly at the end.
    """
    combined = mtbl.merger(merge_func)
    sink = mtbl.writer(output_fname, compression=mtbl.COMPRESSION_SNAPPY)
    for path in input_fnames:
        source = mtbl.reader(path)
        combined.add_reader(source)
    # Copy every merged (key, value) pair into the output table.
    for key, value in combined.iteritems():
        sink[key] = value
    sink.close()
コード例 #5
0
def main(output_fname, input_fnames):
    """Write the merged contents of several MTBL files to ``output_fname``.

    Args:
        output_fname: destination path, written with Snappy compression.
        input_fnames: iterable of input paths; one ``mtbl.reader`` is
            opened per path and registered with the merger.

    Entries produced by the merger's ``iteritems()`` are assigned into the
    writer one by one; the writer is then closed.
    """
    table_merger = mtbl.merger(merge_func)
    out_table = mtbl.writer(
        output_fname,
        compression=mtbl.COMPRESSION_SNAPPY,
    )
    for in_path in input_fnames:
        in_table = mtbl.reader(in_path)
        table_merger.add_reader(in_table)
    for entry_key, entry_val in table_merger.iteritems():
        out_table[entry_key] = entry_val
    out_table.close()
コード例 #6
0
ファイル: pymtbl_wf_analyze.py プロジェクト: rep/pymtbl
def main(txt_fname, mtbl_fname):
    """Record every word of a Project Gutenberg text in an MTBL file.

    The text before the "*** START OF THIS PROJECT GUTENBERG EBOOK" line,
    that line itself and the five lines after it are skipped.  Reading
    stops at the first line that begins with one of the end-of-book
    markers.  Each whitespace-separated token is stripped of surrounding
    punctuation, lower-cased and stored in the sorter with the value
    ``mtbl.varint_encode(1)`` (presumably summed per word by
    ``merge_func`` -- confirm against its definition).

    If the start marker never appears, the whole file is consumed and an
    empty table is written (the loop previously never terminated).

    Args:
        txt_fname: path of the plain-text input file.
        mtbl_fname: path of the Snappy-compressed MTBL file to create.
    """
    header_marker = '*** START OF THIS PROJECT GUTENBERG EBOOK'
    footer_markers = (
        'End of the Project Gutenberg EBook',
        '*** END OF THIS PROJECT GUTENBERG EBOOK',
    )

    # The context manager guarantees the input file is closed, even when
    # an exception is raised while reading or encoding.
    with open(txt_fname) as txt:
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        # Use one iterator for all three phases so each continues where
        # the previous one stopped.
        lines = iter(txt)

        # trim header; the for loop ends by itself at end of file
        for line in lines:
            if line.startswith(header_marker):
                break
        for _ in range(5):
            next(lines, None)

        for line in lines:
            if line.startswith(footer_markers):
                break
            for tok in line.split():
                word = tok.strip(string.punctuation).lower()
                sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
コード例 #7
0
ファイル: pymtbl_wf_analyze.py プロジェクト: edmonds/pymtbl
def main(txt_fname, mtbl_fname):
    """Record every word of a Project Gutenberg text in an MTBL file.

    Everything up to and including the line starting with
    "*** START OF THIS PROJECT GUTENBERG EBOOK", plus the five lines that
    follow it, is discarded.  Processing ends at the first line starting
    with one of the end-of-book markers.  Each whitespace-separated token
    has surrounding punctuation stripped, is lower-cased, and is stored in
    the sorter with the value ``mtbl.varint_encode(1)`` (presumably
    accumulated per word by ``merge_func`` -- confirm against its
    definition).

    If the start marker is missing, the file is read to its end and an
    empty table is written instead of looping forever.

    Args:
        txt_fname: path of the plain-text input file.
        mtbl_fname: path of the Snappy-compressed MTBL file to create.
    """
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_markers = (
        "End of the Project Gutenberg EBook",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
    )

    # Opening through ``with`` closes the input file on every exit path.
    with open(txt_fname) as txt:
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        # A single shared iterator lets each phase resume where the
        # previous one stopped, and simply runs dry at end of file.
        lines = iter(txt)

        # trim header
        for line in lines:
            if line.startswith(start_marker):
                break
        for _ in range(5):
            next(lines, None)

        for line in lines:
            if line.startswith(end_markers):
                break
            for tok in line.split():
                word = tok.strip(string.punctuation).lower()
                sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
コード例 #8
0
def main(fname, num_keys):
    """Write a random subset of sequential keys directly to an MTBL file.

    The counter runs from 1 to ``num_keys``.  Each value is written with
    probability one half (``random.random() >= 0.5``) under the
    zero-padded key ``'%010d' % count``, so keys reach the writer in
    increasing order.  Values are one random lowercase letter repeated 1
    to 50 times.  A progress line goes to stderr every ``report_interval``
    counter steps and a summary line at the end.

    Args:
        fname: path of the Snappy-compressed MTBL file to create.
        num_keys: upper bound of the key counter (not the number written).
    """
    def fmt(spec, value):
        # locale.format() was removed in Python 3.12; format_string() takes
        # the same arguments and is available on Python 2 as well.
        return locale.format_string(spec, value, grouping=True)

    def rate(entries, secs):
        # A coarse clock can report a zero-length interval; report a rate
        # of 0 instead of raising ZeroDivisionError.
        return entries / secs if secs > 0 else 0

    # NOTE(review): the writer is never closed explicitly in this function;
    # presumably it is finalized when the object is released -- confirm.
    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    a = time.time()
    last = a
    total_bytes = 0
    count = 0
    total = 0
    while count < num_keys:
        count += 1
        if random.random() >= 0.5:
            total += 1
            key = '%010d' % count
            val = random.choice(string.ascii_lowercase) * random.randint(1, 50)
            writer[key] = val
            total_bytes += len(key) + len(val)
        if (count % report_interval) == 0:
            b = time.time()
            last_secs = b - last
            last = b
            sys.stderr.write('wrote %s entries (%s MB) in %s seconds, %s entries/second\n' % (
                fmt('%d', total),
                fmt('%d', total_bytes // megabyte),
                fmt('%f', last_secs),
                fmt('%d', rate(report_interval, last_secs))
                )
            )
    b = time.time()
    total_secs = b - a
    sys.stderr.write('wrote %s total entries (%s MB) in %s seconds, %s entries/second\n' % (
        fmt('%d', total),
        fmt('%d', total_bytes // megabyte),
        fmt('%f', total_secs),
        fmt('%d', rate(total, total_secs))
        )
    )
コード例 #9
0
def main(fname, num_keys):
    """Generate random entries, sort them, and write them to an MTBL file.

    Every entry gets a random integer key zero-padded to width 20 and a
    value consisting of one random lowercase letter repeated 1 to 50
    times.  Entries are collected in a sorter, which is written to the
    output once generation is finished.  Progress is reported on stderr
    every ``report_interval`` entries, followed by a final summary line.

    Args:
        fname: path of the Snappy-compressed MTBL file to create.
        num_keys: number of entries to generate.
    """
    def fmt(spec, value):
        # locale.format() was removed in Python 3.12; format_string() takes
        # the same arguments and is available on Python 2 as well.
        return locale.format_string(spec, value, grouping=True)

    def rate(entries, secs):
        # Guard against a zero elapsed time (coarse clock, num_keys == 0)
        # so the report never raises ZeroDivisionError.
        return entries / secs if secs > 0 else 0

    sorter = mtbl.sorter(merge_func)
    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    a = time.time()
    last = a
    total_bytes = 0
    count = 0
    while count < num_keys:
        count += 1
        # sys.maxint is Python 2 only; sys.maxsize works on 2.6+ and 3.
        key = '%020d' % random.randint(0, sys.maxsize)
        val = random.choice(string.ascii_lowercase) * random.randint(1, 50)
        sorter[key] = val
        total_bytes += len(key) + len(val)
        if (count % report_interval) == 0:
            b = time.time()
            last_secs = b - last
            last = b
            sys.stderr.write('generated %s entries (%s MB) in %s seconds, %s entries/second\n' % (
                fmt('%d', count),
                fmt('%d', total_bytes // megabyte),
                fmt('%f', last_secs),
                fmt('%d', rate(report_interval, last_secs))
                )
            )
    sys.stderr.write('writing to output file %s\n' % fname)
    sorter.write(writer)
    b = time.time()
    total_secs = b - a
    sys.stderr.write('wrote %s total entries (%s MB) in %s seconds, %s entries/second\n' % (
        fmt('%d', count),
        fmt('%d', total_bytes // megabyte),
        fmt('%f', total_secs),
        fmt('%d', rate(count, total_secs))
        )
    )