Esempio n. 1
0
def main(txt_fname, mtbl_fname):
    """Tokenize a Project Gutenberg text file and store its words in an MTBL file.

    The text before the ``*** START OF THIS PROJECT GUTENBERG EBOOK`` line,
    plus the five lines that follow it, is skipped. Reading stops at the
    first line beginning with one of the end-of-book markers. Every
    whitespace-separated token is stripped of surrounding punctuation,
    lower-cased and inserted into the sorter with a varint-encoded value
    of 1; the sorter is constructed with ``merge_func``.
    # NOTE(review): presumably the sorter calls merge_func to combine values
    # of duplicate keys -- confirm against the mtbl documentation.

    Args:
        txt_fname: Path of the input text file.
        mtbl_fname: Path passed to ``mtbl.writer`` for the output file.

    Raises:
        ValueError: If the start marker is never found in the input.
    """
    start_marker = '*** START OF THIS PROJECT GUTENBERG EBOOK'
    end_markers = (
        'End of the Project Gutenberg EBook',
        '*** END OF THIS PROJECT GUTENBERG EBOOK',
    )

    with open(txt_fname) as txt:
        # trim header
        while True:
            line = txt.readline()
            if not line:
                # readline() returns '' at EOF; without this check a file
                # lacking the marker would spin here forever.
                raise ValueError(
                    'start marker not found in {!r}'.format(txt_fname))
            if line.startswith(start_marker):
                break
        for _ in range(5):
            txt.readline()

        # Create the sorter/writer only once the input has been validated,
        # so a malformed input does not leave an output file behind.
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        for line in txt:
            if line.startswith(end_markers):
                break
            for tok in line.split():
                word = tok.strip(string.punctuation).lower()
                if not word:
                    # Token was pure punctuation (e.g. '--'); not a word.
                    continue
                sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
Esempio n. 2
0
def main(txt_fname, mtbl_fname):
    """Tokenize a Project Gutenberg text file and store its words in an MTBL file.

    Everything up to and including the line that starts with
    ``*** START OF THIS PROJECT GUTENBERG EBOOK`` is discarded, along with
    the next five lines. Lines are then read until one begins with an
    end-of-book marker. Each whitespace-separated token has surrounding
    punctuation stripped, is lower-cased, and is inserted into the sorter
    with a varint-encoded value of 1; the sorter is built with
    ``merge_func``.
    # NOTE(review): presumably the sorter calls merge_func to combine values
    # of duplicate keys -- confirm against the mtbl documentation.

    Args:
        txt_fname: Path of the input text file.
        mtbl_fname: Path passed to ``mtbl.writer`` for the output file.

    Raises:
        ValueError: If the start marker is never found in the input.
    """
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_markers = (
        "End of the Project Gutenberg EBook",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
    )

    with open(txt_fname) as txt:
        # trim header; iter(readline, "") stops at EOF instead of looping
        # forever on the empty strings readline() returns there.
        for line in iter(txt.readline, ""):
            if line.startswith(start_marker):
                break
        else:
            raise ValueError(
                "start marker not found in {!r}".format(txt_fname)
            )
        for _ in range(5):
            txt.readline()

        # Create the sorter/writer only once the input has been validated,
        # so a malformed input does not leave an output file behind.
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        for line in txt:
            if line.startswith(end_markers):
                break
            for tok in line.split():
                word = tok.strip(string.punctuation).lower()
                if word:  # skip tokens that were pure punctuation
                    sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
Esempio n. 3
0
def merge_func(key, val0, val1):
    """Combine two varint-encoded values by adding them.

    Both values are decoded with ``mtbl.varint_decode``, summed, and the
    total is re-encoded with ``mtbl.varint_encode``. ``key`` is accepted
    but not used.
    """
    total = sum(mtbl.varint_decode(encoded) for encoded in (val0, val1))
    return mtbl.varint_encode(total)
Esempio n. 4
0
def merge_func(key, val0, val1):
    """Return the varint encoding of the sum of two varint-encoded values.

    ``key`` is part of the signature but does not influence the result.
    """
    return mtbl.varint_encode(
        mtbl.varint_decode(val0) + mtbl.varint_decode(val1)
    )