예제 #1
0
def main():
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'hvd:e:',
                                   ['help',
                                    'verbose',
                                    'dir=',
                                    'extract=',
                                    ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    dir = 'image/enpedia'
    extract = None

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--dir'):
            dir = arg
        elif opt in ('-e', '--extract'):
            extract = arg
        else:
            usage('unhandled option: ' + opt)

    if not os.path.isdir(dir):
        usage('{0:s} is not a directory'.format(dir))

    idx_file = open(os.path.join(dir, "wiki.idx"), "rb")
    fnd_file = SegmentedFileReader(os.path.join(dir, "wiki{0:s}.fnd"))

    dat_format = os.path.join(dir, "wiki{0:d}.dat")

    index_min = 1
    index_max = struct.unpack('<I', idx_file.read(4))[0]

    PrintLog.message('Total index entries = {0:d}'.format(index_max))
    PrintLog.message('')

    for item in args:
        try:
            index_number = int(item.translate(None, ',_'), 0)
        except ValueError:
            usage('"{0:s}" is not numeric'.format(item))

        if index_number < index_min or index_number > index_max:
            usage('index: {0:d} is outdide [{1:d} .. {2:d}]'.format(index_number, index_min, index_max))

        process(index_number, idx_file, fnd_file, dat_format, extract)


    idx_file.close()
    fnd_file.close()
예제 #2
0
def main():
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvd:e:', [
            'help',
            'verbose',
            'dir=',
            'extract=',
        ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    dir = 'image/enpedia'
    extract = None

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--dir'):
            dir = arg
        elif opt in ('-e', '--extract'):
            extract = arg
        else:
            usage('unhandled option: ' + opt)

    if not os.path.isdir(dir):
        usage('{0:s} is not a directory'.format(dir))

    idx_file = open(os.path.join(dir, "wiki.idx"), "rb")
    fnd_file = SegmentedFileReader(os.path.join(dir, "wiki{0:s}.fnd"))

    dat_format = os.path.join(dir, "wiki{0:d}.dat")

    index_min = 1
    index_max = struct.unpack('<I', idx_file.read(4))[0]

    PrintLog.message('Total index entries = {0:d}'.format(index_max))
    PrintLog.message('')

    for item in args:
        try:
            index_number = int(item.translate(None, ',_'), 0)
        except ValueError:
            usage('"{0:s}" is not numeric'.format(item))

        if index_number < index_min or index_number > index_max:
            usage('index: {0:d} is outdide [{1:d} .. {2:d}]'.format(
                index_number, index_min, index_max))

        process(index_number, idx_file, fnd_file, dat_format, extract)

    idx_file.close()
    fnd_file.close()
예제 #3
0
def main():
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'hv',
                                   ['help',
                                    'verbose',
                                    ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    uint32_size = 4

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        else:
            usage('unhandled option: ' + opt)

    if len(args) < 1:
        usage('missing arguments')

    fnd_file = SegmentedFileReader(args)

    total_entries = 0

    bigram_table = {}
    for i in range(128,256):
        bigram_table[i] = fnd_file.read(2)

    previous_title1 = ''
    previous_title2 = ''

    while True:
        fnd_offset = fnd_file.tell()
        header = fnd_file.read(uint32_size + 1)

        if 0 == len(header):
            break

        article_number, nul_byte = struct.unpack('<IB', header)

        title1 = get_title(fnd_file)
        title2 = get_title(fnd_file)
        total_entries += 1

        length1 = len(title1)
        length2 = len(title2)

        if 0 != length1 and title1[0] < ' ':
            prefix_length = ord(title1[0]) + 1
            title1 = previous_title1[:prefix_length] + title1[1:]

        if 0 != length2 and title2[0] < ' ':
            prefix_length = ord(title2[0]) + 1
            title2 = previous_title2[:prefix_length] + title2[1:]

        full_length1 = len(title1)
        full_length2 = len(title2)

        decoded_title1 = ''
        for c in title1:
            i = ord(c)
            if i in bigram_table:
                decoded_title1 += bigram_table[i]
            else:
                decoded_title1 += c

        PrintLog.message(u'Index: {an:13n} @ Offset: {of:13n} [0x{of:08x}]\n'
                         u'{pad1:s}[{l1:3d}/{fl1:3d}]:{t1!r:s}\n'
                         u'{pad1:s}{pad2}{dt1!r:s}\n'
                         u'{pad1:s}[{fl1:3d}/{fl2:3d}]:"{t2:s}"\n'
                         .format(of = fnd_offset, an = article_number,
                                 l1 = length1, fl1 = full_length1, t1 = title1, dt1 = decoded_title1,
                                 pad1 = ' ' * 2, pad2 = ' ' * (2 * 3 + 4),
                                 l2 = length2, fl2 = full_length2, t2 = truncated_utf8(title2)))

        previous_title1 = title1
        previous_title2 = title2

    fnd_file.close()

    PrintLog.message(u'Total entries  = {0:13n}'.format(total_entries))
예제 #4
0
def main():
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hv', [
            'help',
            'verbose',
        ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    uint32_size = 4

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        else:
            usage('unhandled option: ' + opt)

    if len(args) < 1:
        usage('missing arguments')

    fnd_file = SegmentedFileReader(args)

    total_entries = 0

    bigram_table = {}
    for i in range(128, 256):
        bigram_table[i] = fnd_file.read(2)

    previous_title1 = ''
    previous_title2 = ''

    while True:
        fnd_offset = fnd_file.tell()
        header = fnd_file.read(uint32_size + 1)

        if 0 == len(header):
            break

        article_number, nul_byte = struct.unpack('<IB', header)

        title1 = get_title(fnd_file)
        title2 = get_title(fnd_file)
        total_entries += 1

        length1 = len(title1)
        length2 = len(title2)

        if 0 != length1 and title1[0] < ' ':
            prefix_length = ord(title1[0]) + 1
            title1 = previous_title1[:prefix_length] + title1[1:]

        if 0 != length2 and title2[0] < ' ':
            prefix_length = ord(title2[0]) + 1
            title2 = previous_title2[:prefix_length] + title2[1:]

        full_length1 = len(title1)
        full_length2 = len(title2)

        decoded_title1 = ''
        for c in title1:
            i = ord(c)
            if i in bigram_table:
                decoded_title1 += bigram_table[i]
            else:
                decoded_title1 += c

        PrintLog.message(u'Index: {an:13n} @ Offset: {of:13n} [0x{of:08x}]\n'
                         u'{pad1:s}[{l1:3d}/{fl1:3d}]:{t1!r:s}\n'
                         u'{pad1:s}{pad2}{dt1!r:s}\n'
                         u'{pad1:s}[{fl1:3d}/{fl2:3d}]:"{t2:s}"\n'.format(
                             of=fnd_offset,
                             an=article_number,
                             l1=length1,
                             fl1=full_length1,
                             t1=title1,
                             dt1=decoded_title1,
                             pad1=' ' * 2,
                             pad2=' ' * (2 * 3 + 4),
                             l2=length2,
                             fl2=full_length2,
                             t2=truncated_utf8(title2)))

        previous_title1 = title1
        previous_title2 = title2

    fnd_file.close()

    PrintLog.message(u'Total entries  = {0:13n}'.format(total_entries))