Example #1
    def partition(self, text):
        """private method - simulate zero width spaces for Japanese"""
        l = []
        r = ''
        last_n = ''
        for c in text:
            try:
                n = unicodedata.name(c).split()[0]
            except ValueError:
                n = 'NoName'
                PrintLog.message(u'No unicode name for: "{0:s}"'.format(c))

            if n in self.CJK:
                if '' != r:
                    l.append(r)
                r = c
                last_n = n

            elif last_n in self.CJK:
                if n in self.PUNCTUATION:
                    l.append(r + c)
                    r = ''
                    last_n = ''
                else:
                    l.append(r)
                    r = c
                    last_n = n
            else:
                r += c
                last_n = n

        if '' != r:
            l.append(r)
        return l
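A hedged sketch of what this produces. The CJK and PUNCTUATION sets are instance attributes that the excerpt does not show, so their contents below are assumptions (leading words of Unicode character names):

    # assumed: self.CJK contains e.g. 'CJK', 'HIRAGANA', 'KATAKANA'
    # assumed: self.PUNCTUATION contains e.g. 'IDEOGRAPHIC', 'FULLWIDTH'
    #
    # partition(u'日本語abc') -> [u'日', u'本', u'語', u'abc']
    #   every CJK character becomes its own chunk (the simulated zero width
    #   space), runs of non-CJK characters stay together, and a punctuation
    #   mark directly after a CJK character stays attached to it, e.g.
    #   partition(u'語。abc') -> [u'語。', u'abc'].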
Example #2
 def handle_entityref(self, name):
     """handle & > ..."""
     try:
         self.handle_data(unichr(htmlentitydefs.name2codepoint[name]))
     except KeyError:
         PrintLog.message(u'ENTITYREF ERROR: {0:s} article: {1:s}'.format(
             name, g_this_article_title))
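The lookup itself is plain Python 2 standard library; a one-line illustration outside the HTMLParser subclass (g_this_article_title is assumed context from the surrounding module):

    import htmlentitydefs

    # '&gt;' -> u'>'; unknown entity names raise KeyError, which the handler
    # above logs together with the current article title.
    assert unichr(htmlentitydefs.name2codepoint['gt']) == u'>'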
Example #3
 def resolve_redirects(self):
     """add redirect to article_index"""
     global verbose
     count = 0
     if verbose:
         PrintLog.message(u'Resolving redirects')
     for item in self.redirects:
         try:
             self.set_index(item, self.find(item)[:3] + (True,))
             count += 1
             if verbose and count % 1000 == 0:
                 PrintLog.message(u'Redirects resolved: {0:d}'.format(count))
         except KeyError:
             PrintLog.message(u'Unresolved redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
         except CycleError:
             PrintLog.message(u'Cyclic redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
     if verbose:
         PrintLog.message(u'Total redirects resolved: {0:d}'.format(count))
     return count
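What the tuple slicing does, judging from the other examples here (Example #17 stores (article_count, -1, restricted, False) per title and Example #42 unpacks (article_number, fnd_offset, restricted, is_redirect)); a sketch under that assumption:

    # self.find(item)               -> e.g. (4711, -1, False, False)
    # self.find(item)[:3] + (True,) ->      (4711, -1, False, True)
    #
    # i.e. the redirect entry reuses the target's article number, offset
    # placeholder and restricted flag, and the final field marks it as a
    # redirect.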
Example #4
    def title(self, category, key, title, seek):
        if self.KEY_ARTICLE != key:
            if verbose:
                PrintLog.message('Non-article: {0:s}:{1:s}'.format(category, title))
            return False

        return True
Example #5
 def resolve_redirects(self):
     """add redirect to article_index"""
     global verbose
     count = 0
     if verbose:
         PrintLog.message(u'Resolving redirects')
     for item in self.redirects:
         try:
             self.set_index(item, self.find(item)[:3] + (True, ))
             count += 1
             if verbose and count % 1000 == 0:
                 PrintLog.message(
                     u'Redirects resolved: {0:d}'.format(count))
         except KeyError:
             PrintLog.message(u'Unresolved redirect: {0:s} -> {1:s}'.format(
                 item, self.redirects[item]))
         except CycleError:
             PrintLog.message(u'Cyclic redirect: {0:s} -> {1:s}'.format(
                 item, self.redirects[item]))
     if verbose:
         PrintLog.message(u'Total redirects resolved: {0:d}'.format(count))
     return count
Example #6
    def title(self, category, key, title, seek):
        if self.KEY_ARTICLE != key:
            if verbose:
                PrintLog.message('Non-article: {0:s}:{1:s}'.format(
                    category, title))
            return False

        return True
Example #7
def main():
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'hvd:e:',
                                   ['help',
                                    'verbose',
                                    'dir=',
                                    'extract=',
                                    ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    dir = 'image/enpedia'
    extract = None

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--dir'):
            dir = arg
        elif opt in ('-e', '--extract'):
            extract = arg
        else:
            usage('unhandled option: ' + opt)

    if not os.path.isdir(dir):
        usage('{0:s} is not a directory'.format(dir))

    idx_file = open(os.path.join(dir, "wiki.idx"), "rb")
    fnd_file = SegmentedFileReader(os.path.join(dir, "wiki{0:s}.fnd"))

    dat_format = os.path.join(dir, "wiki{0:d}.dat")

    index_min = 1
    index_max = struct.unpack('<I', idx_file.read(4))[0]

    PrintLog.message('Total index entries = {0:d}'.format(index_max))
    PrintLog.message('')

    for item in args:
        try:
            index_number = int(item.translate(None, ',_'), 0)
        except ValueError:
            usage('"{0:s}" is not numeric'.format(item))

        if index_number < index_min or index_number > index_max:
            usage('index: {0:d} is outside [{1:d} .. {2:d}]'.format(index_number, index_min, index_max))

        process(index_number, idx_file, fnd_file, dat_format, extract)


    idx_file.close()
    fnd_file.close()
Example #8
def main():
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvd:e:', [
            'help',
            'verbose',
            'dir=',
            'extract=',
        ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    dir = 'image/enpedia'
    extract = None

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--dir'):
            dir = arg
        elif opt in ('-e', '--extract'):
            extract = arg
        else:
            usage('unhandled option: ' + opt)

    if not os.path.isdir(dir):
        usage('{0:s} is not a directory'.format(dir))

    idx_file = open(os.path.join(dir, "wiki.idx"), "rb")
    fnd_file = SegmentedFileReader(os.path.join(dir, "wiki{0:s}.fnd"))

    dat_format = os.path.join(dir, "wiki{0:d}.dat")

    index_min = 1
    index_max = struct.unpack('<I', idx_file.read(4))[0]

    PrintLog.message('Total index entries = {0:d}'.format(index_max))
    PrintLog.message('')

    for item in args:
        try:
            index_number = int(item.translate(None, ',_'), 0)
        except ValueError:
            usage('"{0:s}" is not numeric'.format(item))

        if index_number < index_min or index_number > index_max:
            usage('index: {0:d} is outside [{1:d} .. {2:d}]'.format(
                index_number, index_min, index_max))

        process(index_number, idx_file, fnd_file, dat_format, extract)

    idx_file.close()
    fnd_file.close()
Example #9
 def resolve_redirects(self):
     """add redirect to article_index"""
     count = 0
     for item in self.redirects:
         try:
             self.set_index(item, self.find(item)[:3] + (True,))
             count += 1
         except KeyError:
             PrintLog.message(u'Unresolved redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
         except CycleError:
             PrintLog.message(u'Cyclic redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
     return count
Example #11
def make_link(url, x0, x1, text):
    global g_starty, g_curr_face, g_link_cnt, g_links, g_this_article_title

    if article_index(url):
        try:
            esc_code10(x1 - x0)
            g_links[g_link_cnt] = (x0, g_starty - get_lineheight(g_curr_face),
                                   x1, g_starty, url)
            g_link_cnt = g_link_cnt + 1
        except Exception as err:
            PrintLog.message(
                u'Exception making link {0:s} in article {1:s}: {2:s}'.format(
                    url, g_this_article_title, err.message))
Example #12
    def title(self, category, key, title, seek):
        global verbose
        global enable_templates

        if self.KEY_ARTICLE == key:
            return True

        if enable_templates and self.KEY_TEMPLATE == key:
            if verbose:
                PrintLog.message(u'Template Title: {0:s}'.format(unicode(title, 'utf-8')))
            return True

        return False
Example #14
def process_article_text(id, count, title, text, newf):
    global verbose

    if verbose:
        PrintLog.message(u'[PA {0:d}] {1:s}'.format(count, title))

    text = TidyUp.article(text)

    if newf:
        newf.write('{0:d}:'.format(id))
        newf.write(title[1:].encode('utf-8'))  # We pad the title to force the database to import strings
        newf.write('\n__NOTOC__\n')
        newf.write(text.encode('utf-8') + '\n')
        newf.write('***EOF***\n')
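For reference, the stream written to newf frames one article per record; the numeric id is the file id passed in by the caller (see the parser driver in Example #41). Roughly:

    # 7:Some Article Title
    # __NOTOC__
    # ... tidied wiki text ...
    # ***EOF***
    #
    # A downstream reader can therefore split the stream on '***EOF***' lines;
    # the title is written from index 1 because of the padding noted in the
    # comment above.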
Example #16
def main():
    global verbose
    global INDEX_ITEM_SIZE
    global UINT32_SIZE

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvo:f:p:',
                                   ['help', 'verbose', 'output=', 'prefix='])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    in_format = 'pedia{0:d}.idx-tmp'
    out_name = 'pedia.idx'

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-p', '--prefix'):
            in_format = arg + '{0:d}.idx-tmp'
        elif opt in ('-o', '--output'):
            out_name = arg
        else:
            usage('unhandled option: ' + opt)

    out = open(out_name, 'wb')

    article_count = 0
    i = 0
    data = {}
    while True:
        in_name = in_format.format(i)
        if not os.path.isfile(in_name):
            break
        if verbose:
            PrintLog.message('combining: {0:s}'.format(in_name))
        data[i] = open(in_name, 'rb').read()
        article_count += len(data[i]) / INDEX_ITEM_SIZE
        i += 1

    out.write(struct.pack('<I', article_count))

    for j in range(i):
        out.write(data[j])

    out.close()

    PrintLog.message('Combined {0:d} files'.format(i))
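The combined file thus starts with a little-endian uint32 article count followed by the concatenated fixed-size entries (note this is Python 2, where len(data[i]) / INDEX_ITEM_SIZE is integer division). A minimal sketch of reading the header back, assuming the default pedia.idx name:

    import struct

    def read_article_count(idx_name='pedia.idx'):
        """Read back the little-endian uint32 count written by main() above."""
        with open(idx_name, 'rb') as f:
            return struct.unpack('<I', f.read(4))[0]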
Example #17
    def body(self, category, key, title, text, seek):
        global verbose
        global error_flag

        title = self.convert(title).strip(u'\u200e\u200f')

        if self.KEY_TEMPLATE == key:
            if title not in self.ignored_templates:
                title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
                t_body = TidyUp.template(text)
                self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)',
                                             [u'~{0:d}~{1:s}'.format(self.file_id(), title), u'~' + t_body])
                self.template_count += 1
            return

        restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)

        self.article_count += 1

        # do closer inspection to see if really restricted
        if restricted:
            (restricted, bad_words) = FilterWords.find_restricted(text)

        if restricted:
            self.restricted_count += 1

        if self.article_count % 10000 == 0:
            start_time = time.time()
            PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
            self.time = start_time

        for t in self.language_processor.translate(title):
            generate_bigram(t)

        if verbose:
            if restricted:
                PrintLog.message(u'Restricted Title: {0:s}'.format(title))
                PrintLog.message(u'  --> {0:s}'.format(bad_words))
            else:
                PrintLog.message(u'Title: {0:s}'.format(title))

        character_count = len(text)
        self.total_character_count += character_count
        self.offsets[self.article_count] = (self.file_id(), title, seek, character_count, self.total_character_count)

        if self.set_index(title, (self.article_count, -1, restricted, False)): # -1 == place holder
            PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
            error_flag = True
Example #18
def main():
    global verbose
    global INDEX_ITEM_SIZE
    global UINT32_SIZE

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvo:f:p:', ['help', 'verbose', 'output=', 'prefix='])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    in_format = 'pedia{0:d}.idx-tmp'
    out_name = 'pedia.idx'

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-p', '--prefix'):
            in_format = arg + '{0:d}.idx-tmp'
        elif opt in ('-o', '--output'):
            out_name = arg
        else:
            usage('unhandled option: ' + opt)

    out = open(out_name, 'wb')

    article_count = 0
    i = 0
    data = {}
    while True:
        in_name = in_format.format(i)
        if not os.path.isfile(in_name):
            break
        if verbose:
            PrintLog.message('combining: {0:s}'.format(in_name))
        data[i] = open(in_name, 'rb').read()
        article_count += len(data[i]) / INDEX_ITEM_SIZE
        i += 1

    out.write(struct.pack('<I', article_count))

    for j in range(i):
        out.write(data[j])

    out.close()

    PrintLog.message('Combined {0:d} files'.format(i))
Example #19
    def body(self, category, key, title, text, seek):
        global verbose
        global error_flag

        title = self.convert(title).strip(u'\u200e\u200f')

        if self.KEY_TEMPLATE == key:
            if title not in self.ignored_templates:
                title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
                t_body = TidyUp.template(text)
                self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)',
                                             [u'~{0:d}~{1:s}'.format(self.file_id(), title), u'~' + t_body])
                self.template_count += 1
            return

        restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)

        self.article_count += 1

        # do closer inspection to see if really restricted
        if restricted:
            (restricted, bad_words) = FilterWords.find_restricted(text)

        if restricted:
            self.restricted_count += 1

        if not verbose and self.article_count % 10000 == 0:
            start_time = time.time()
            PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
            self.time = start_time

        for t in self.language_processor.translate(title):
            generate_bigram(t)

        if verbose:
            if restricted:
                PrintLog.message(u'Restricted Title: {0:s}'.format(title))
                PrintLog.message(u'  --> {0:s}'.format(bad_words))
            else:
                PrintLog.message(u'Title: {0:s}'.format(title))

        character_count = len(text)
        self.total_character_count += character_count
        self.offsets[self.article_count] = (self.file_id(), title, seek, character_count, self.total_character_count)

        if self.set_index(title, (self.article_count, -1, restricted, False)): # -1 == place holder
            PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
            error_flag = True
Example #20
def process(index_number, idx_file, fnd_file, dat_format):
    """dump the index and fnd file entries"""

    global verbose
    global sizes
    global distribution
    global dist_list
    global total
    global byte_count

    if verbose:
        PrintLog.message(
            'Index number = {0:10n} 0x{0:08x}'.format(index_number))

    uint32_size = 4
    index_entry_size = 2 * uint32_size + 1

    index_offset = uint32_size + index_entry_size * (index_number - 1)

    idx_file.seek(index_offset)
    offset_dat, offset_fnd, file_id = struct.unpack(
        '<2IB', idx_file.read(index_entry_size))

    data_file_name = dat_format.format(file_id)

    dat_file = open(data_file_name, 'rb')
    dat_file.seek(offset_dat)

    number_of_pages = struct.unpack('B', dat_file.read(1))[0]

    for i in range(0, number_of_pages):
        page_id, page_offset, page_length = struct.unpack(
            '<3I', dat_file.read(12))
        restricted = 'Restricted' if (0 != page_offset & 0x80000000) else ''
        page_offset = page_offset & 0x7fffffff

        if page_id in sizes:
            PrintLog.message('DUP: {0:10n}'.format(page_id))

        sizes[page_id] = page_length
        for d in dist_list:
            if page_length <= d:
                distribution[d] += 1
                byte_count[d] += page_length
                total += 1
                break

    dat_file.close()
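The unpack calls above imply the on-disk layout: wiki.idx starts with a uint32 entry count, then 9-byte records ('<2IB': data offset, fnd offset, file id); each group in wikiN.dat starts with a one-byte page count followed by 12-byte '<3I' page records, with bit 31 of the page offset flagging restricted pages. A sketch of fetching one index record:

    import struct

    UINT32_SIZE = 4
    INDEX_ENTRY_SIZE = 2 * UINT32_SIZE + 1   # '<2IB' -> 9 bytes

    def read_index_entry(idx_file, index_number):
        """Return (offset_dat, offset_fnd, file_id) for a 1-based index number."""
        idx_file.seek(UINT32_SIZE + INDEX_ENTRY_SIZE * (index_number - 1))
        return struct.unpack('<2IB', idx_file.read(INDEX_ENTRY_SIZE))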
Example #21
def esc_code14(width, height, data):
    """output bitmap"""
    global g_starty, g_curr_face
    global output

    if 0 == width or 0 == height:
        return

    if len(data) != (width + 7) // 8 * height:
        PrintLog.message(u'Error: Corrupt Image')

    output.write(struct.pack('<BBH', 15, width, height) + data)

    lineh = get_lineheight(g_curr_face)

    if height >= lineh:
        g_starty += height - lineh + 3   # since Eric draws images 3px lower for alignment
Example #22
def esc_code14(width, height, data):
    """output bitmap"""
    global g_starty, g_curr_face
    global output

    if 0 == width or 0 == height:
        return

    if len(data) != (width + 7) // 8 * height:
        PrintLog.message(u'Error: Corrupt Image')

    output.write(struct.pack('<BBH', 15, width, height) + data)

    lineh = get_lineheight(g_curr_face)

    if height >= lineh:
        g_starty += height - lineh + 3  # since Eric draws images 3px lower for alignment
Example #23
def get_imgdata(imgfile, indent):
    try:
        img = gd.image(imgfile)
    except IOError as e:
        PrintLog.message(
            u'unable to open image file: {0:s} because: {1:s}'.format(
                imgfile, e))
        return (0, 0, r'')

    (width, height) = img.size()
    if width <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(
            img.getPixel((x, y)))
        h_range = range(0, width)
        v_range = range(0, height)
    elif height <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(
            img.getPixel((y, x)))
        v_range = range(0, width)
        h_range = range(height - 1, -1, -1)
        (width, height) = (height, width)
    else:
        PrintLog.message(u'image file: {0:s} is too big'.format(imgfile))
        return (0, 0, r'')

    data = ''
    for v in v_range:
        byte = 0
        bit_count = 8

        for h in h_range:
            if is_black(h, v):
                pixel = 1
            else:
                pixel = 0
            bit_count -= 1
            byte |= pixel << bit_count
            if 0 == bit_count:
                data += struct.pack('<B', byte)
                byte = 0
                bit_count = 8
        if 8 != bit_count:
            data += struct.pack('<B', byte)

    return (width, height, data)
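The inner loop packs each row one bit per pixel, most significant bit first, padding the final byte of a row; this is why esc_code14 in Example #21 expects len(data) == (width + 7) // 8 * height. A stand-alone sketch of the same packing (Python 2, where struct.pack returns a str):

    import struct

    def pack_row(pixels):
        """Pack an iterable of 0/1 pixel values MSB-first, padding the last byte."""
        data, byte, bit_count = '', 0, 8
        for pixel in pixels:
            bit_count -= 1
            byte |= pixel << bit_count
            if 0 == bit_count:
                data += struct.pack('<B', byte)
                byte, bit_count = 0, 8
        if 8 != bit_count:                   # flush the partially filled byte
            data += struct.pack('<B', byte)
        return data

    # pack_row([1, 0, 0, 0, 0, 0, 0, 1]) == '\x81'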
Example #24
def process(index_number, idx_file, fnd_file, dat_format):
    """dump the index and fnd file entries"""

    global verbose
    global sizes
    global distribution
    global dist_list
    global total
    global byte_count

    if verbose:
        PrintLog.message('Index number = {0:10n} 0x{0:08x}'.format(index_number))

    uint32_size = 4
    index_entry_size = 2 * uint32_size + 1

    index_offset = uint32_size + index_entry_size * (index_number - 1)

    idx_file.seek(index_offset)
    offset_dat, offset_fnd, file_id = struct.unpack('<2IB', idx_file.read(index_entry_size))

    data_file_name = dat_format.format(file_id)

    dat_file = open(data_file_name, 'rb')
    dat_file.seek(offset_dat)

    number_of_pages = struct.unpack('B', dat_file.read(1))[0]

    for i in range(0, number_of_pages):
        page_id, page_offset, page_length = struct.unpack('<3I', dat_file.read(12))
        restricted = 'Restricted' if (0 != page_offset & 0x80000000) else ''
        page_offset = page_offset & 0x7fffffff

        if page_id in sizes:
            PrintLog.message('DUP: {0:10n}'.format(page_id))

        sizes[page_id] = page_length
        for d in dist_list:
            if page_length <= d:
                distribution[d] += 1
                byte_count[d] += page_length
                total += 1
                break

    dat_file.close()
Example #25
def output_pfx(filename):
    """output the pfx matrix"""
    global index_matrix

    PrintLog.message(u'Writing: {0:s}'.format(filename))
    start_time = time.time()
    out_f = open(filename, 'wb')
    list = '\0' + SearchKey.all_characters()
    for k1 in list:
        for k2 in list:
            for k3 in list:
                key = k1+k2+k3
                if key in index_matrix:
                    offset = index_matrix[key]
                else:
                    offset = 0
                out_f.write(struct.pack('<I', offset))

    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
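The file is a dense cube of little-endian uint32 offsets, one per three-character key over '\0' plus the search alphabet, written in k1/k2/k3 order. A sketch of how a reader could index into it, assuming it is given the same alphabet the writer used:

    import struct

    def pfx_lookup(pfx_file, key, alphabet):
        """Fetch the offset stored for a 3-character key; short keys are NUL padded."""
        chars = '\0' + alphabet              # must match the writer above
        n = len(chars)
        k1, k2, k3 = (chars.index(c) for c in key.ljust(3, '\0'))
        pfx_file.seek(4 * ((k1 * n + k2) * n + k3))
        return struct.unpack('<I', pfx_file.read(4))[0]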
Example #27
def get_imgdata(imgfile, indent):
    try:
        img = gd.image(imgfile)
    except IOError as e:
        PrintLog.message(u'unable to open image file: {0:s} because: {1:s}'.format(imgfile, e))
        return (0, 0, r'')

    (width, height) = img.size()
    if width <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((x, y)))
        h_range = range(0, width)
        v_range = range(0, height)
    elif height <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((y, x)))
        v_range = range(0, width)
        h_range = range(height - 1, -1, -1)
        (width, height) = (height, width)
    else:
        PrintLog.message(u'image file: {0:s} is too big'.format(imgfile))
        return (0, 0, r'')

    data = ''
    for v in v_range:
        byte = 0
        bit_count = 8

        for h in h_range:
            if is_black(h, v):
                pixel = 1
            else:
                pixel = 0
            bit_count -= 1
            byte |= pixel << bit_count
            if 0 == bit_count:
                data += struct.pack('<B', byte)
                byte = 0
                bit_count = 8
        if 8 != bit_count:
            data += struct.pack('<B', byte)

    return (width, height, data)
Example #28
    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        global whitespaces
        global verbose

        title = self.translate(title).strip(u'\u200e\u200f')

        rtitle = self.translate(rtitle).strip().strip(u'\u200e\u200f')
        rtitle = whitespaces.sub(' ', rtitle).strip().lstrip(':')

        if self.KEY_TEMPLATE == key:
            if title != rtitle:
                title = unicode(category, 'utf-8') + ':' + title.lower()
                rtitle = unicode(rcategory, 'utf-8') + ':' + rtitle.lower()
                self.template_cursor.execute(u'insert or replace into redirects (title, redirect) values(?, ?)',
                                             [u'~{0:d}~{1:s}'.format(self.file_id(), title),
                                              u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])

            self.template_redirect_count += 1
            return

        if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
            if verbose:
                PrintLog.message(u'Non-article Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'
                                 .format(category, key, title, rcategory, rkey, rtitle))
            return

        if '' == rtitle:
            PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(category, key, title))
        else:
            self.redirects[title] = rtitle
            self.redirect_count += 1
            if verbose:
                PrintLog.message(u'Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'
                                 .format(category, key, title, rcategory, rkey, rtitle))
Example #29
    def handle_charref(self, name):
        """handle &#DDDD; &#xXXXX;"""

        if 0 == len(name):
            return

        if 'x' == name[0] or 'X' == name[0]:
            try:
                value = int(name[1:], 16)
            except ValueError:
                PrintLog.message(u'charref: "{0:s}" is not hexadecimal'.format(name))
                return

        elif name.isdigit():
            try:
                value = int(name)
            except ValueError:
                PrintLog.message(u'charref: "{0:s}" is not decimal'.format(name))
                return

        else:
            # neither hexadecimal nor decimal: there is no value to convert
            return

        try:
            c = unichr(value)
        except ValueError:
            PrintLog.message(u'charref: "{0:d}" is not convertible to unicode'.format(value))
            c = '?'
        self.handle_data(c)
Example #30
    def handle_charref(self, name):
        """handle &#DDDD; &#xXXXX;"""

        if 0 == len(name):
            return

        if 'x' == name[0] or 'X' == name[0]:
            try:
                value = int(name[1:], 16)
            except ValueError:
                PrintLog.message(
                    u'charref: "{0:s}" is not hexadecimal'.format(name))
                return

        elif name.isdigit():
            try:
                value = int(name)
            except ValueError:
                PrintLog.message(
                    u'charref: "{0:s}" is not decimal'.format(name))
                return

        else:
            # neither hexadecimal nor decimal: there is no value to convert
            return

        try:
            c = unichr(value)
        except ValueError:
            PrintLog.message(
                u'charref: "{0:d}" is not convertible to unicode'.format(
                    value))
            c = '?'
        self.handle_data(c)
Example #31
    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        global verbose

        title = self.convert(title).strip(u'\u200e\u200f')

        rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')

        # redirected title may contain '%xx' items - treat as unicode sequence
        # if it fails just keep the %xx sequences intact since it must represent
        # either real %xx or some unknowable coding scheme
        try:
            rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')),
                             'utf-8').strip().strip(u'\u200e\u200f')
        except UnicodeDecodeError:
            pass

        rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()

        if self.KEY_TEMPLATE == key:
            if title != rtitle:
                title = unicode(
                    category,
                    'utf-8').capitalize() + ':' + upper_case_first_char(title)
                rtitle = unicode(
                    rcategory,
                    'utf-8').capitalize() + ':' + upper_case_first_char(rtitle)
                self.template_cursor.execute(
                    u'insert or replace into redirects (title, redirect) values(?, ?)',
                    [
                        u'~{0:d}~{1:s}'.format(self.file_id(), title),
                        u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)
                    ])

            self.template_redirect_count += 1
            return

        if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
            if verbose:
                PrintLog.message(
                    u'Non-article Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'
                    .format(unicode(category, 'utf-8'), key, title,
                            unicode(rcategory, 'utf-8'), rkey, rtitle))
            return

        if '' == rtitle:
            PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(
                category, key, title))
        else:
            self.redirects[title] = rtitle
            self.redirect_count += 1

            for t in self.language_processor.translate(title):
                generate_bigram(t)

            if verbose:
                PrintLog.message(
                    u'Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'.
                    format(category, key, title, rcategory, rkey, rtitle))
Example #32
def write_article_index(file_offset, length):
    global verbose
    global output, f_out, i_out
    global g_this_article_title
    global file_number

    try:
        (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
        data_offset = (file_offset & 0x7fffffff)

        if bool(int(restricted)):  # restricted may be the string '0', which is truthy, so convert via int() first
            data_offset |= 0x80000000
        data_length = (0x80 << 24) | (file_number << 24) | length  # 0x80 => lzma encoding
        i_out.write(struct.pack('III', data_offset, fnd_offset, data_length))
        i_out.flush()
    except KeyError:
        PrintLog.message(u'Error in: write_article, Title not found')
        PrintLog.message(u'Title:  {0:s}'.format(g_this_article_title))
        PrintLog.message(u'Offset: {0:s}'.format(file_offset))
        PrintLog.message(u'Count:  {0:s}'.format(article_count))
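The three uint32s pack several fields into bit ranges: bit 31 of the data offset is the restricted flag, and the top byte of data_length carries the 0x80 lzma marker plus the file number. A sketch of inverting that packing:

    def decode_index_entry(data_offset, fnd_offset, data_length):
        """Undo the bit packing used by write_article_index above (a sketch)."""
        restricted = bool(data_offset & 0x80000000)
        offset = data_offset & 0x7fffffff
        lzma_encoded = bool(data_length & 0x80000000)
        file_number = (data_length >> 24) & 0x7f
        length = data_length & 0x00ffffff
        return offset, restricted, lzma_encoded, file_number, length, fnd_offset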
Example #33
    def body(self, category, key, title, text, seek):
        global verbose, show_restricted

        restricted_title = FilterWords.is_restricted(title)
        restricted_text = FilterWords.is_restricted(text)
        restricted = restricted_title or restricted_text

        self.article_count += 1
        if restricted:
            self.restricted_count += 1

        if not verbose and self.article_count % 10000 == 0:
            start_time = time.time()
            PrintLog.message('{0:7.2f}s {1:10d}'.format(
                start_time - self.time, self.article_count))
            self.time = start_time

        if verbose:
            PrintLog.message('Title: {0:s}'.format(title))

        if restricted:
            if restricted_title:
                t_state = ' Title'
            else:
                t_state = ''

            if restricted_text:
                b_state = ' Text'
                (flag, contains) = FilterWords.find_restricted(text)
                if not flag:
                    self.unrestricted_count += 1
            else:
                b_state = ''
                contains = None
            if show_restricted:
                PrintLog.message('{0:10d} Restricted{1:s}{2:s}: {3:s}'.format(
                    self.restricted_count, t_state, b_state, title))
                if None != contains:
                    PrintLog.message('        -> {0!s:s} {1:s}'.format(
                        flag, contains))
Example #34
    def body(self, category, key, title, text, seek):
        global verbose, show_restricted

        restricted_title = FilterWords.is_restricted(title)
        restricted_text = FilterWords.is_restricted(text)
        restricted = restricted_title or restricted_text

        self.article_count += 1
        if restricted:
            self.restricted_count += 1

        if not verbose and self.article_count % 10000 == 0:
            start_time = time.time()
            PrintLog.message('{0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
            self.time = start_time

        if verbose:
            PrintLog.message('Title: {0:s}'.format(title))

        if restricted:
            if restricted_title:
                t_state = ' Title'
            else:
                t_state = ''

            if restricted_text:
                b_state = ' Text'
                (flag, contains) = FilterWords.find_restricted(text)
                if not flag:
                    self.unrestricted_count += 1
            else:
                b_state = ''
                contains = None
            if show_restricted:
                PrintLog.message('{0:10d} Restricted{1:s}{2:s}: {3:s}'
                                 .format(self.restricted_count, t_state, b_state, title))
                if None != contains:
                    PrintLog.message('        -> {0!s:s} {1:s}'.format(flag, contains))
Example #35
    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        global verbose

        title = self.convert(title).strip(u'\u200e\u200f')

        rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')

        # redirected title may contain '%xx' items - treat as unicode sequence
        # if it fails just keep the %xx sequences intact since it must represent
        # either real %xx or some unknowable coding scheme
        try:
            rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')), 'utf-8').strip().strip(u'\u200e\u200f')
        except UnicodeDecodeError:
            pass

        rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()

        if self.KEY_TEMPLATE == key:
            if title != rtitle:
                title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
                rtitle = unicode(rcategory, 'utf-8').capitalize() + ':' + upper_case_first_char(rtitle)
                self.template_cursor.execute(u'insert or replace into redirects (title, redirect) values(?, ?)',
                                             [u'~{0:d}~{1:s}'.format(self.file_id(), title),
                                              u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])

            self.template_redirect_count += 1
            return

        if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
            if verbose:
                PrintLog.message(u'Non-article Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'
                                 .format(unicode(category, 'utf-8'), key, title,
                                         unicode(rcategory, 'utf-8'), rkey, rtitle))
            return

        if '' == rtitle:
            PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(category, key, title))
        else:
            self.redirects[title] = rtitle
            self.redirect_count += 1

            for t in self.language_processor.translate(title):
                generate_bigram(t)

            if verbose:
                PrintLog.message(u'Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'
                                 .format(category, key, title, rcategory, rkey, rtitle))
Example #36
            break
        (file_id, title, seek, length) = row

        if file_id != current_file_id:
            current_file_id = file_id
            if input_file:
                input_file.close()
            offset_cursor.execute('select filename from files where file_id = ? limit 1', (file_id,))
            filename = offset_cursor.fetchone()[0]
            input_file = open(filename, 'rb')
            if not input_file:
                PrintLog.message('Failed to open: {0:s}'.format(filename))
                current_file_id = None
                continue
            if verbose:
                PrintLog.message(u'Opened: {0:s}'.format(filename))

        try:
            input_file.seek(seek)
        except Exception as e:
            PrintLog.message(u'seek failed: e={0!s:s}  seek={1:d}  f={2:s}'.format(e, seek, filename))
            sys.exit(1)

        # if the background process has died, restart it so that every failing article is recorded
        if None != background_process and None == process_id:
            process_id = subprocess.Popen(background_process, shell=True, stdin=subprocess.PIPE)

        try:
            process_article_text(current_file_id, total_articles + 1, title,
                                 input_file.read(length), process_id.stdin)
        except Exception as e:
Example #37
def main():
    """ main processing"""

    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvi:o:',
                                   ['help', 'verbose',
                                    'input=',
                                    'output=',
                                    ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    input_file_name = 'pinyin_table.txt'
    output_file_name = 'PinyinTable.py'

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-i', '--input'):
            input_file_name = arg
        elif opt in ('-o', '--output'):
            output_file_name = arg
        else:
            usage('unhandled option: ' + opt)

    if [] != args:
        usage('Extraneous argument(s)')


    PrintLog.message(u'Reading Data File: {0:s}'.format(input_file_name))

    errors = False
    pinyin = {}

    with open(input_file_name, 'rb') as f:
        PrintLog.message(u'File Header: {0:s}'.format(f.readline().strip()))
        PrintLog.message(u'File Version: {0:s}'.format(f.readline().strip()))
        expected_lines = int(f.readline())
        line_count = 0
        char_count = 0
        for line in f:
            line_count += 1
            n = line.strip().split()
            phonetic = make_pinyin(n.pop(0))
            item_count = int(n.pop(0))
            if len(n) != item_count:
                PrintLog.message(u'Error: incorrect item count, expected: {0:d} got: {1:d}'.format(item_count, len(n)))
                errors = True
                break
            for s in n:
                cjk = unicode(s, 'utf-8')[0]
                if cjk in pinyin:
                    pinyin[cjk] += [phonetic]
                else:
                    pinyin[cjk] = [phonetic]
                char_count += 1
        if line_count == expected_lines:
            PrintLog.message(u'Counted CJK glyphs: {0:d}'.format(char_count))
            PrintLog.message(u'Expected Lines: {0:d}'.format(expected_lines))
            PrintLog.message(u'Counted Lines: {0:d}'.format(line_count))
        else:
            PrintLog.message(u'Error: line count mismatch: {0:d} != {1:d}'.format(expected_lines, line_count))
            errors = True

    if errors:
        PrintLog.message(u'Error: failed to read data file')
        return 1
    else:
        PrintLog.message(u'Data Read Completed Successfully')

    text = u'欧洲,软件+互联网[用统一码]  歐洲,軟體及網際網路[讓統一碼] ABC 西安 先'
    expected = u'ōuzhōu,ruănjiàn+hùliánwăng[yòngtŏngyīmă]  ōuzhōu,ruăntĭjíwăngjìwănglù[ràngtŏngyīmă] ABC xīān xiān'

    result = u''
    for c in text:
        if c in pinyin:
            result += pinyin[c][0]
        else:
            result += c

    if result == expected:
        PrintLog.message(u'Creating: {0:s}'.format(output_file_name))

        generate_output(output_file_name, 6, pinyin)

        PrintLog.message(u'Finished: {0:s}'.format(output_file_name))

    else:
        PrintLog.message(u'Error in test:')
        PrintLog.message(u'input:    {0:s}'.format(text))
        PrintLog.message(u'output:   {0:s}'.format(result))
        PrintLog.message(u'expected: {0:s}'.format(expected))
        return 2

    return 0
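From the parsing loop, each data line of pinyin_table.txt after the three header lines is a phonetic syllable, an item count, and that many UTF-8 characters; the concrete line below is illustrative only, not taken from the real table, and the tone-digit input to make_pinyin() is an assumption:

    # <phonetic> <count> <char> <char> ... <char>
    #
    # e.g.   xi1 2 西 希
    #
    # make_pinyin() converts the raw syllable into its accented form, and each
    # character maps to a list of candidate readings in the pinyin dict.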
Example #38
 def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
     self.redirect_count += 1
     if verbose:
         PrintLog.message('Redirect: {0:s}:{1:s} -> {2:s}:{3:s}'.format(category, title, rcategory, rtitle))
Example #39
def main():
    global verbose
    global error_flag


    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvi:o:c:t:I:l:p:L:T',
                                   ['help', 'verbose',
                                    'article-index=',
                                    'article-offsets=',
                                    'article-counts=',
                                    'templates=',
                                    'ignore-templates=',
                                    'limit=',
                                    'prefix=',
                                    'language=',
                                    'truncate-title',
                                    ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    art_name = "articles.db"
    off_name = "offsets.db"
    cnt_name = "counts.text"
    fnd_name = 'pedia{0:s}.fnd'
    pfx_name = 'pedia.pfx'
    template_name = 'templates.db'
    ignore_templates_name = None
    limit = 'all'
    language = 'en'             # some languages may require special processing
    truncate_title = False      # set to True when not using language links

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-i', '--article-index'):
            art_name = arg
        elif opt in ('-o', '--article-offsets'):
            off_name = arg
        elif opt in ('-c', '--article-counts'):
            cnt_name = arg
        elif opt in ('-t', '--templates'):
            template_name = arg
        elif opt in ('-I', '--ignore-templates'):
            ignore_templates_name = arg
            if not os.path.exists(ignore_templates_name):
                usage(u'ignore-templates file: {0:s} does not exist'.format(ignore_templates_name))
        elif opt in ('-T', '--truncate-title'):
            truncate_title = True
        elif opt in ('-l', '--limit'):
            if arg[-1] == 'k':
                arg = arg[:-1] + '000'
            if arg != 'all':
                try:
                    limit = int(arg)
                except ValueError:
                    usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if limit <= 0:
                usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
        elif opt in ('-p', '--prefix'):
            fnd_name = arg + '{0:s}.fnd'
            pfx_name = arg + '.pfx'
        elif opt in ('-L', '--language'):
            language = arg
        else:
            usage('unhandled option: ' + opt)

    if [] == args:
        usage('Missing argument(s)')

    ignored_templates = {}
    if None != ignore_templates_name:
        with open(ignore_templates_name) as f:
            for l in f.readlines():
                line = unicode(l, 'utf-8').strip()
                if line.startswith('#'):
                    continue
                if '' != line:
                    ignored_templates[line] = True

    language_convert = LanguageTranslation.LanguageNormal()
    if 'ja' == language:
        language_convert = LanguageTranslation.LanguageJapanese()

    processor = FileProcessing(articles = art_name, offsets = off_name,
                               templates = template_name,
                               ignored_templates = ignored_templates,
                               language = language_convert)

    for f in args:
        limit = processor.process(f, limit)
        if limit != 'all' and limit <= 0:
            break

    # record initial counts
    a = processor.article_count
    r = processor.redirect_count

    # fix up redirects
    m = a + processor.resolve_redirects()

    # record combined count and display statistics
    s = a + r

    cf = open(cnt_name, 'w')

    for f in (sys.stdout, cf):
        f.write('Articles:   {0:10d}\n'.format(a))
        f.write('Redirects:  {0:10d}\n'.format(r))
        f.write('Sum:        {0:10d}\n'.format(s))
        f.write('Merged:     {0:10d}\n'.format(m))
        f.write('Difference: {0:10d}\n'.format(m - s))

        f.write('Restricted: {0:10d}\n'.format(processor.restricted_count))

        f.write('Templates:  {0:10d}\n'.format(processor.template_count))
        f.write('rTemplates: {0:10d}\n'.format(processor.template_redirect_count))

        f.write('Characters: {0:10d}\n'.format(processor.total_character_count))

    cf.close()

    output_fnd(fnd_name, processor, language_convert, truncate_title)
    output_pfx(pfx_name)
    del processor

    # return non-zero status if there have been any errors
    if error_flag:
        PrintLog.message('*** ERROR in Index build')
        PrintLog.message('***   Currently "Duplicate Title" is the only condition that causes this error')
        PrintLog.message('***   Likely "license.xml" or "terms.xml" file duplicates a title in main wiki file')
        PrintLog.message('***   Manually edit "license.xml" or "terms.xml" file to change the title')
        sys.exit(1)
Example #40
    def handle_endtag(self, tag):
        global g_this_article_title
        global article_count
        global warnings

        # ignore end tag without start tag
        if (tag, True) not in self.tag_stack and (tag, False) not in self.tag_stack:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: superfluous </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column, article_count + 1, g_this_article_title))
            return

        # backtrack up the stack closing each open tag until there is a match
        (start_tag, self.printing) = self.tag_stack.pop()
        while start_tag != tag:
            self.tag_stack.append((start_tag, self.printing))
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: force </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(start_tag, line, column, article_count + 1, g_this_article_title))
            self.handle_endtag(start_tag)
            (start_tag, self.printing) = self.tag_stack.pop()

        # must always do </html> tag
        if tag == 'html':
            self.printing = True
            self.tag_stack = []
            self.in_html = False
            esc_code1()
            write_article(self.language_links)
            return

        if not self.printing:
            return

        elif tag == 'script':
            pass

        elif tag == 'title':
            self.in_title = False
            g_this_article_title = g_this_article_title.strip()

        elif tag == 'body':
            self.in_body = False
            self.flush_buffer()

        elif tag == 'table':
            if self.in_table > 0:
                self.in_table -= 1

        # if in a table suppress everything after this point
        if self.in_table > 0:
            return

        elif tag == 'h1':
            self.flush_buffer()
            self.in_h1 = False
            esc_code0(H1_MARGIN_BOTTOM)

        elif tag == 'h2':
            self.flush_buffer()
            self.in_h2 = False

        elif tag == 'h3':
            self.flush_buffer()
            self.in_h3 = False

        elif tag == 'h4':
            self.flush_buffer()
            self.in_h4 = False

        elif tag == 'h5':
            self.flush_buffer()
            self.in_h5 = False

        elif tag == 'h6':
            self.flush_buffer()
            self.in_h6 = False

        elif tag == 'div':
            self.flush_buffer()

        elif tag == 'p':
            self.flush_buffer()
            self.in_p = False

        elif tag == 'blockquote':
            self.flush_buffer()
            if self.quote > 0:
                if self.quote < MAX_QUOTE_LEVEL:
                    self.indent -= BLOCKQUOTE_MARGIN_LEFT
                    self.lwidth += BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                    esc_code9(-BLOCKQUOTE_MARGIN_LEFT)
                self.quote -= 1

        elif tag == 'b':
            self.in_b = False

        elif tag == 'big':
            self.in_b = False

        elif tag == 'strong':
            self.in_b = False

        elif tag == 'i':
            self.in_i = False

        elif tag == 'del':
            self.in_del = False

        elif tag == 'ins':
            self.in_ins = False

        elif tag == 'a':
            self.in_a = False
            self.url = ""

        elif tag in ['ul', 'ol', 'dl']:
            self.leave_list()

        elif tag == 'li':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
            else:
                self.flush_buffer(False)
                self.list_decrease_indent()
                self.li_inside[self.level] = False

        elif tag == 'dd':
            self.flush_buffer()
            self.list_decrease_indent()

        elif tag == 'dt':
            self.flush_buffer()

        elif tag == 'br':
            self.flush_buffer()
            self.in_br = False

        elif tag == 'img':
            self.in_img = False
Example #41
def main():
    global verbose
    global PARSER_COMMAND
    global total_articles

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvx:s:c:o:t:l:V:jnw:T:',
                                   ['help', 'verbose', 'xhtml=',
                                    'start=', 'count=',
                                    'article-offsets=',
                                    'templates=',
                                    'language=',
                                    'language-variant=',
                                    'just-cat',
                                    'no-output',
                                    'parser-workdir=',
                                    'parser-tempdir=',
                                    ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    out_name = 'all_articles.html'
    off_name = 'offsets.db'
    parser_workdir = '/tmp'
    parser_tempdir = os.path.join(parser_workdir, 'tmp')
    start_article = 1
    article_count = 'all'
    failed_articles = 0
    do_output = True
    template_name = 'templates.db'
    language = 'en'
    language_variant = ''

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-x', '--xhtml'):
            out_name = arg
        elif opt in ('-o', '--article-offsets'):
            off_name = arg
        elif opt in ('-t', '--templates'):
            template_name = arg
        elif opt in ('-l', '--language'):
            language = arg
        elif opt in ('-V', '--language-variant'):
            language_variant = arg
        elif opt in ('-w', '--parser-workdir'):
            parser_workdir = arg
        elif opt in ('-T', '--parser-tempdir'):
            parser_tempdir = arg
        elif opt in ('-j', '--just-cat'):
            PARSER_COMMAND = 'cat'
        elif opt in ('-n', '--no-output'):
            do_output = False
        elif opt in ('-s', '--start'):
            if arg[-1] == 'k':
                arg = arg[:-1] + '000'
            try:
                start_article = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if start_article < 1:
                usage('"{0:s}={1:s}" must be >= 1'.format(opt, arg))
        elif opt in ('-c', '--count'):
            if arg[-1] == 'k':
                arg = arg[:-1] + '000'
            if arg != 'all':
                try:
                    article_count = int(arg)
                except ValueError:
                    usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if article_count <= 0:
                usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
        else:
            usage('unhandled option: ' + opt)

    if not os.path.isdir(parser_workdir):
        usage('workdir: {0:s} does not exist'.format(parser_workdir))

    if not os.path.isdir(parser_tempdir):
        usage('tempdir: {0:s} does not exist'.format(parser_tempdir))

    # pass parameters to the PHP parser
    os.environ['WORKDIR'] = parser_workdir
    os.environ['TEMPDIR'] = parser_tempdir
    os.environ['LANGUAGE'] = language.lower()
    os.environ['LANGUAGE_VARIANT'] = language_variant.lower().replace('_', '-') # e.g. zh_TW -> zh-tw
    os.environ['TEMPLATE_DB'] = template_name

    offset_db = sqlite3.connect(off_name)
    offset_db.execute('pragma synchronous = 0')
    offset_db.execute('pragma temp_store = 2')
    offset_db.execute('pragma read_uncommitted = true')
    offset_db.execute('pragma cache_size = 20000000')
    offset_db.execute('pragma default_cache_size = 20000000')
    offset_db.execute('pragma journal_mode = off')

    offset_cursor = offset_db.cursor()

    if do_output:
        background_process = PARSER_COMMAND + ' > ' + out_name
    else:
        background_process = None

    # process all required articles
    out_base_name = os.path.basename(out_name) # for logging messages
    current_file_id = None
    input_file = None
    process_id = None
    total_articles = 0
    start_time = time.time()
    while article_count == 'all' or article_count != 0:
        offset_cursor.execute('select file_id, title, seek, length from offsets where article_number = ? limit 1',
                              (start_article,))
        row = offset_cursor.fetchone()
        if None == row:
            break
        (file_id, title, seek, length) = row

        if file_id != current_file_id:
            current_file_id = file_id
            if input_file:
                input_file.close()
            offset_cursor.execute('select filename from files where file_id = ? limit 1', (file_id,))
            filename = offset_cursor.fetchone()[0]
            input_file = open(filename, 'rb')
            if not input_file:
                PrintLog.message('Failed to open: {0:s}'.format(filename))
                current_file_id = None
                continue
            if verbose:
                PrintLog.message(u'Opened: {0:s}'.format(filename))

        try:
            input_file.seek(seek)
        except Exception as e:
            PrintLog.message(u'seek failed: e={0!s:s}  seek={1:d}  f={2:s}'.format(e, seek, filename))
            sys.exit(1)

        # if the background process has died, restart it so that every failing article is recorded
        if None != background_process and None == process_id:
            process_id = subprocess.Popen(background_process, shell=True, stdin=subprocess.PIPE)

        try:
            process_article_text(current_file_id, total_articles + 1, title,
                                 input_file.read(length), process_id.stdin)
        except Exception as e:
            failed_articles += 1
            # extract from log by: grep '^!' log-file
            PrintLog.message(u'!Process failed, file: {0:s} article({1:d}): {2:s} because: {3!s:s}'
                             .format(filename, total_articles, title, e))
            trace = sys.exc_info()
            if None != trace:
                traceback.print_tb(trace[2])
            process_id.stdin.close()
            process_id.wait()
            process_id = None

        if article_count != 'all':
            article_count -= 1
        total_articles += 1
        start_article += 1
        if not verbose and total_articles % 1000 == 0:
            if 0 != failed_articles:
                failed_message = 'Failed: {0:d}'.format(failed_articles)
            else:
                failed_message = ''
            now_time = time.time()
            PrintLog.message(u'Parse[{0:s}]: {1:7.2f}s {2:10d}  {3:s}'
                             .format(out_base_name, now_time - start_time,
                              total_articles, failed_message))
            start_time = now_time

    # close files
    if input_file:
        input_file.close()

    # wait for background process to finish
    if process_id:
        process_id.stdin.close()
        process_id.wait()

    # output some statistics and create count file
    PrintLog.message(u'Parse[{0:s}]: Total:  {1:d}'.format(out_base_name, total_articles))

    # write the total count for Rendering program
    fd = open(out_name + '.count', 'wb')
    fd.write('TOTAL_ARTICLES = {count:d}\n'.format(count = total_articles))
    fd.close()

    # indicate failures
    if 0 != failed_articles:
        PrintLog.message(u'Parse[{0:s}]: Failed: {1:d}'.format(out_base_name, failed_articles))
        sys.exit(1)
Example #42
0
def output_fnd(filename, article_index, language_processor, truncate_title):
    """create bigram table"""
    global bigram
    global index_matrix
    global MAXIMUM_TITLE_LENGTH
    global MAXIMUM_TITLE_ACTUAL

    PrintLog.message(u'Writing bigrams: {0:s}'.format(filename))
    start_time = time.time()
    out_f = open(filename, 'wb')

    sortedgram = [ (value, key) for key, value in bigram.iteritems() ]
    sortedgram.sort()
    sortedgram.reverse()

    bigram = {}
    i = 0
    for k, v in sortedgram:
        out_f.write(v)
        bigram[v] = chr(i + 128)
        i += 1
        if i >= 128:
            break
    while i < 128:
        out_f.write('zz')
        bigram['zz'] = chr(i + 128)
        i += 1

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    # create pfx matrix and write encoded titles

    #article_list = [strip_accents(k) for k in article_index.keys()]
    #article_list.sort(key = lambda x: strip_accents(x).lower())

    PrintLog.message(u'Sorting titles')
    start_time = time.time()

    article_list = [ (SearchKey.make_key(language_processor.translate(title)), title)
                      for title in article_index.all_indices() ]
    article_list.sort()

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing matrix: {0:s}'.format(filename))
    start_time = time.time()

    index_matrix = {}
    index_matrix['\0\0\0'] = out_f.tell()

    previous_bigram_title = ''
    previous_utf8_title = ''
    mod_counter = 0

    for stripped_title, title in article_list:

        bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
        (article_number, dummy, restricted, is_redirect) = article_index.get_index(title)

        if '' == bigram_title and is_redirect:
            continue

        utf8_title = title.encode('utf-8')
        if truncate_title:
            utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
        else:
            utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]

        offset = out_f.tell()
        article_index.set_index(title, (article_number, offset, restricted, is_redirect))

        key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
        key2 = key3[0:2] + '\0'
        key1 = key3[0:1] + '\0\0'
        if key1 not in index_matrix:
            index_matrix[key1] = offset
        if key2 not in index_matrix:
            index_matrix[key2] = offset
        if key3 not in index_matrix:
            index_matrix[key3] = offset

        if 0 == mod_counter & 0x0f:
            bigram_common_length = 0
            utf8_common_length = 0
        else:
            bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
            utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
        mod_counter += 1

        previous_bigram_title = bigram_title
        previous_utf8_title = utf8_title

        if bigram_common_length > 1:
            bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
        if utf8_common_length > 1:
            utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]

        out_f.write(struct.pack('<I', article_number) + '\0' + bigram_title + '\0' + utf8_title + '\0')

    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
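The writer above front-codes consecutive sorted titles: every 16th entry (when mod_counter & 0x0f is zero) is stored in full, and every other entry replaces the prefix it shares with its predecessor by a single byte holding the prefix length minus one. The dump program in the next example reverses this by checking for a leading byte below the space character. A minimal sketch of the pairing, with helper names that are mine rather than from the source:

def common_prefix_length(a, b):
    """number of leading characters shared by a and b"""
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

def front_encode(previous, current):
    """replace a shared prefix (longer than one byte) with one length byte,
    as output_fnd does; assumes the shared prefix stays below 32 bytes so the
    length byte remains less than ' ' and the reader can recognise it"""
    common = common_prefix_length(previous, current)
    if common > 1:
        return chr(common - 1) + current[common:]
    return current

def front_decode(previous, encoded):
    """invert front_encode: a leading byte below ' ' means copy that many
    (plus one) bytes from the previous title"""
    if '' != encoded and encoded[0] < ' ':
        prefix_length = ord(encoded[0]) + 1
        return previous[:prefix_length] + encoded[1:]
    return encoded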
Example #43
0
def main():
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hv', [
            'help',
            'verbose',
        ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    uint32_size = 4

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        else:
            usage('unhandled option: ' + opt)

    if len(args) < 1:
        usage('missing arguments')

    fnd_file = SegmentedFileReader(args)

    total_entries = 0

    bigram_table = {}
    for i in range(128, 256):
        bigram_table[i] = fnd_file.read(2)

    previous_title1 = ''
    previous_title2 = ''

    while True:
        fnd_offset = fnd_file.tell()
        header = fnd_file.read(uint32_size + 1)

        if 0 == len(header):
            break

        article_number, nul_byte = struct.unpack('<IB', header)

        title1 = get_title(fnd_file)
        title2 = get_title(fnd_file)
        total_entries += 1

        length1 = len(title1)
        length2 = len(title2)

        if 0 != length1 and title1[0] < ' ':
            prefix_length = ord(title1[0]) + 1
            title1 = previous_title1[:prefix_length] + title1[1:]

        if 0 != length2 and title2[0] < ' ':
            prefix_length = ord(title2[0]) + 1
            title2 = previous_title2[:prefix_length] + title2[1:]

        full_length1 = len(title1)
        full_length2 = len(title2)

        decoded_title1 = ''
        for c in title1:
            i = ord(c)
            if i in bigram_table:
                decoded_title1 += bigram_table[i]
            else:
                decoded_title1 += c

        PrintLog.message(u'Index: {an:13n} @ Offset: {of:13n} [0x{of:08x}]\n'
                         u'{pad1:s}[{l1:3d}/{fl1:3d}]:{t1!r:s}\n'
                         u'{pad1:s}{pad2}{dt1!r:s}\n'
                         u'{pad1:s}[{l2:3d}/{fl2:3d}]:"{t2:s}"\n'.format(
                             of=fnd_offset,
                             an=article_number,
                             l1=length1,
                             fl1=full_length1,
                             t1=title1,
                             dt1=decoded_title1,
                             pad1=' ' * 2,
                             pad2=' ' * (2 * 3 + 4),
                             l2=length2,
                             fl2=full_length2,
                             t2=truncated_utf8(title2)))

        previous_title1 = title1
        previous_title2 = title2

    fnd_file.close()

    PrintLog.message(u'Total entries  = {0:13n}'.format(total_entries))
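get_title is not defined in this dump example; since the writer terminates both the bigram-encoded and the UTF-8 title with a NUL byte, a reader along the following lines is consistent with the format (the name and the byte-at-a-time loop are an assumption, not the original implementation):

def get_title(f):
    """read one NUL-terminated title string from the .fnd file"""
    result = ''
    while True:
        c = f.read(1)
        if '' == c or '\0' == c:    # end of file or end of title
            break
        result += c
    return result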
Example #44
0
        f.write('Restricted: {0:10d}\n'.format(processor.restricted_count))

        f.write('Templates:  {0:10d}\n'.format(processor.template_count))
        f.write('rTemplates: {0:10d}\n'.format(processor.template_redirect_count))

        f.write('Characters: {0:10d}\n'.format(processor.total_character_count))

    cf.close()

    output_fnd(fnd_name, processor, language_convert, truncate_title)
    output_pfx(pfx_name)
    del processor

    # return non-zero status if there have been any errors
    if error_flag:
        PrintLog.message('*** ERROR in Index build')
        PrintLog.message('***   Currently "Duplicate Title" is the only condition that causes this error')
        PrintLog.message('***   Likely "license.xml" or "terms.xml" file duplicates a title in main wiki file')
        PrintLog.message('***   Manually edit "license.xml" or "terms.xml" file to change the title')
        sys.exit(1)


def generate_bigram(text):
    """create bigram from pairs of characters"""
    global bigram

    if len(text) > 2:
        try:
            if SearchKey.is_valid_character(text[0]) and SearchKey.is_valid_character(text[1]):
                bigram[text[0:2]] += 1
        except KeyError:
            bigram[text[0:2]] = 1
Example #45
0
 def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
     self.redirect_count += 1
     if verbose:
         PrintLog.message('Redirect: {0:s}:{1:s} -> {2:s}:{3:s}'.format(
             category, title, rcategory, rtitle))
Example #46
0
def write_article(language_links):
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count,
                                                     g_this_article_title))

    elif article_count % 1000 == 0:
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(
            file_number, now_time - start_time, article_count))
        start_time = now_time

    # create links
    links_stream = io.BytesIO('')

    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(
            struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1,
                        link_number(url)))

    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO('')
    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate

    for l in language_links:
        language, link = l.split(':', 1)

        language = language.strip()
        link = link.strip()

        # only need the first pronunciation for the link
        # as this must always be present
        if link is not None and '' != link:
            if 'ja' == language:
                stripped = japanese_convert(link)[0]
            else:
                stripped = normal_convert(link)[0]

            stripped = SearchKey.strip_accents(stripped)

            if link == stripped:
                links_stream.write(l.encode('utf-8') + '\0')
            else:
                links_stream.write((language + '#' +
                                    stripped).encode('utf-8') + '\1' +
                                   link.encode('utf-8') + '\0')

    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.fetch()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        try:
            (article_number, fnd_offset,
             restricted) = article_index(g_this_article_title)
            restricted = bool(
                int(restricted))  # '0' is True so turn it into False
            article_writer.add_article(article_number, whole_article,
                                       fnd_offset, restricted)
        except KeyError:
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title:  {0:s}'.format(g_this_article_title))
            PrintLog.message(u'Count:  {0:d}'.format(article_count))
    else:
        f_out.write(whole_article)
        f_out.flush()
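Each link record above packs the link rectangle and target into three little-endian 32-bit words, with the x coordinate in the low byte and the y coordinate in the higher bits of the first two words. A small sketch of the matching decoder (a hypothetical helper, not part of the renderer):

import struct

def decode_link_record(record):
    """invert the '<3I' packing used in write_article"""
    word0, word1, target = struct.unpack('<3I', record)
    x0, y0 = word0 & 0xff, word0 >> 8
    x1, y1 = word1 & 0xff, word1 >> 8
    return (x0, y0, x1, y1, target)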
Example #47
0
def main():
    global verbose, warnings, compress
    global f_out, output, i_out
    global font_id_values
    global file_number
    global article_count
    global article_db
    global start_time
    global article_writer
    global MAXIMUM_ARTICLES_PER_BLOCK

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvwn:d:p:i:t:f:L:l:a:b:m:', [
            'help',
            'verbose',
            'warnings',
            'number=',
            'data-prefix=',
            'index-prefix=',
            'article-index=',
            'test=',
            'font-path=',
            'language=',
            'language-links=',
            'images=',
            'articles=',
            'block-size=',
            'max-article-length=',
        ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    warnings = False
    data_file = 'pedia{0:d}.dat'
    index_file = 'pedia{0:d}.idx-tmp'
    art_file = 'articles.db'
    file_number = 0
    test_file = ''
    font_path = "../fonts"
    article_db = None
    language = 'en'
    inter_links = True
    enable_images = True
    articles_per_block = 32
    block_size = 262144
    max_article_length = 'unlimited'

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-w', '--warnings'):
            warnings = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-t', '--test'):
            test_file = arg
        elif opt in ('-i', '--article-index'):
            art_file = arg
        elif opt in ('-n', '--number'):
            try:
                file_number = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
        elif opt in ('-d', '--data-prefix'):
            data_file = arg + '{0:d}.dat'
        elif opt in ('-p', '--index-prefix'):
            index_file = arg + '{0:d}.idx-tmp'
        elif opt in ('-f', '--font-path'):
            font_path = arg
        elif opt in ('-L', '--language'):
            language = arg.lower()
        elif opt in ('-l', '--language-links'):
            arg = arg.lower()
            inter_links = ('yes' == arg)
        elif opt == '--images':  # long option only; '-l' is already used by --language-links
            arg = arg.lower()
            enable_images = ('yes' == arg)
        elif opt in ('-a', '--articles'):
            try:
                articles_per_block = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if articles_per_block < 1 or articles_per_block > MAXIMUM_ARTICLES_PER_BLOCK:
                usage('"{o:s}={a:s}" is out of range [1..{m:d}]'.format(
                    o=opt, a=arg, m=MAXIMUM_ARTICLES_PER_BLOCK))
        elif opt in ('-b', '--block-size'):
            try:
                block_size = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if block_size < 65536 or block_size > 524288:
                usage('"{0:s}={1:s}" is out of range [65536..524288]'.format(
                    opt, arg))
        elif opt in ('-m', '--max-article-length'):
            if 'unlimited' == arg.lower():
                max_article_length = 'unlimited'
            else:
                try:
                    max_article_length = int(arg)
                except ValueError:
                    usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
                if max_article_length < 0:
                    usage(
                        '"{0:s}={1:s}" is out of range [0..unlimited]'.format(
                            opt, arg))
        else:
            usage('unhandled option: ' + opt)

    start_time = time.time()

    f_fontr = open(os.path.join(font_path, "text.bmf"), "rb")
    f_fonti = open(os.path.join(font_path, "texti.bmf"), "rb")
    f_fontt = open(os.path.join(font_path, "title.bmf"), "rb")
    f_fontst = open(os.path.join(font_path, "subtitle.bmf"), "rb")
    f_font_all = open(os.path.join(font_path, "textall.bmf"), "rb")
    f_fontt_all = open(os.path.join(font_path, "titleall.bmf"), "rb")
    f_fontst_all = open(os.path.join(font_path, "subtlall.bmf"), "rb")

    font_id_values = {
        ITALIC_FONT_IDX: f_fonti,
        DEFAULT_FONT_IDX: f_fontr,
        TITLE_FONT_IDX: f_fontt,
        TITLE_ALL_FONT_IDX: f_fontt_all,
        SUBTITLE_FONT_IDX: f_fontst,
        SUBTITLE_ALL_FONT_IDX: f_fontst_all,
        DEFAULT_ALL_FONT_IDX: f_font_all
    }

    article_db = sqlite3.connect(art_file)

    article_db.execute('pragma auto_vacuum = none')
    article_db.execute('pragma synchronous = off')
    article_db.execute('pragma temp_store = memory')
    article_db.execute('pragma locking_mode = normal')
    article_db.execute('pragma read_uncommitted = true')
    article_db.execute('pragma cache_size = 20000000')
    article_db.execute('pragma default_cache_size = 20000000')
    article_db.execute('pragma journal_mode = off')

    def y_adjust(inc):
        global g_starty
        g_starty += inc

    output = EscapeBuffer.EscapeBuffer(callback=y_adjust,
                                       max_length=max_article_length)

    if test_file == '':
        compress = True
        i_out = open(index_file.format(file_number), 'wb')
        f_out = open(data_file.format(file_number), 'wb')
        article_writer = ArticleWriter(file_number,
                                       f_out,
                                       i_out,
                                       max_buckets=50,
                                       bucket_size=block_size,
                                       max_items_per_bucket=articles_per_block)
    else:
        compress = False
        f_out = open(test_file, 'wb')

    for name in args:
        f = codecs.open(name, 'r', 'utf-8', 'replace')

        t = get_parameter_value(name + '.count', 'TOTAL_ARTICLES')
        if t is not None:
            PrintLog.message("Render[{0:d}]: Total: {1:s}".format(
                file_number, t))

        WrProcess(f, language, inter_links, enable_images)
        f.close()

    for item in font_id_values:
        font_id_values[item].close()

    if output != None:
        del output

    if article_writer != None:
        del article_writer

    if f_out != None:
        f_out.close()
    if i_out != None:
        i_out.close()

    if article_db != None:
        article_db.close()

    # final message
    PrintLog.message("Render[{0:d}]: Total: {1:d}".format(
        file_number, article_count))
Example #48
0
    def __del__(self):
        PrintLog.message(u'Flushing databases')
        self.template_db.commit()
        self.template_cursor.close()
        self.template_db.close()

        PrintLog.message(u'Writing: files')
        start_time = time.time()
        i = 0
        with open(self.file_import, 'w') as f:
            for filename in self.file_list:
                f.write('{0:d}\t{1:s}\n'.format(i, filename))
                i += 1
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Writing: articles')
        start_time = time.time()
        with open(self.article_import, 'w') as f:
            for title in self.articles:
                (article_number, fnd_offset, restricted, is_redirect) = self.articles[title]
                f.write('~' + title.encode('utf-8'))    # force string
                f.write('\t{0:d}\t{1:d}\t{2:d}\t{3:d}\n'.format(article_number, fnd_offset, restricted, is_redirect))
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Writing: offsets')
        start_time = time.time()
        with open(self.offset_import, 'w') as f:
            for article_number in self.offsets:
                (file_id, title, seek, length, accumulated) = self.offsets[article_number]
                f.write('{0:d}\t{1:d}\t'.format(article_number, file_id))
                f.write('~' + title.encode('utf-8'))    # force string
                f.write('\t{0:d}\t{1:d}\t{2:d}\n'.format(seek, length, accumulated))
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))


        PrintLog.message(u'Loading: articles')
        start_time = time.time()
        p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.article_db_name, shell=True, stdin=subprocess.PIPE)
        p.stdin.write("""
create table articles (
    title varchar primary key,
    article_number integer,
    fnd_offset integer,
    restricted integer,
    is_redirect integer
);

pragma synchronous = 0;
pragma temp_store = 2;
pragma locking_mode = exclusive;
pragma cache_size = 20000000;
pragma default_cache_size = 20000000;
pragma journal_mode = memory;

.mode tabs
.import {0:s} articles
.exit
""".format(self.article_import))
        p.stdin.close()
        p.wait()
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Loading: offsets and files')
        start_time = time.time()
        p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.offset_db_name, shell=True, stdin=subprocess.PIPE)
        p.stdin.write("""
create table offsets (
    article_number integer primary key,
    file_id integer,
    title varchar,
    seek integer,
    length integer,
    accumulated integer
);

create table files (
    file_id integer primary key,
    filename varchar
);

pragma synchronous = 0;
pragma temp_store = 2;
pragma locking_mode = exclusive;
pragma cache_size = 20000000;
pragma default_cache_size = 20000000;
pragma journal_mode = memory;

.mode tabs
.import {0:s} offsets
.import {1:s} files
.exit
""".format(self.offset_import, self.file_import))
        p.stdin.close()
        p.wait()
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
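The articles table is loaded from a tab-separated file in which every title carries a leading '~' so that sqlite's .import treats numeric-looking titles as text. A minimal read-back sketch under that assumption (lookup_article is illustrative, not part of the original tooling):

import sqlite3

def lookup_article(article_db_name, title):
    """fetch one row; the stored title has a leading '~' added at import time"""
    db = sqlite3.connect(article_db_name)
    try:
        return db.execute('select article_number, fnd_offset, restricted, is_redirect'
                          ' from articles where title = ?',
                          ('~' + title.encode('utf-8'),)).fetchone()
    finally:
        db.close()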
Example #49
0
    def handle_starttag(self, tag, attrs):

        global g_starty, g_curr_face, g_halign
        global g_this_article_title, g_links, g_link_cnt
        global warnings

        attrs = dict(attrs)

        # must always do the <html> tag
        if tag == 'html':
            self.local_init()
            self.in_html = True
            self.tag_stack = [(tag, True)]
            return

        self.tag_stack.append((tag, self.printing))

        # we want to skip content that isn't for printing
        if 'class' in attrs:
            if 'noprint' in attrs['class']:
                self.printing = False

            # create a list of language links
            if self.inter_links and tag == 'a' and 'lang-link' in attrs[
                    'class']:
                link = attrs['href']
                (lang, data) = link.split(':', 1)
                if lang != self.language:
                    self.language_links.append(link)

        # handle the tags
        if not self.printing:
            return

        elif tag == 'script':
            self.printing = False

        elif tag == 'title':
            self.in_title = True
            g_this_article_title = ''

        elif tag == 'body':
            self.in_body = True

        elif tag == 'table':
            self.in_table += 1

        # if in a table suppress everything after this point
        if self.in_table > 0:
            return

        elif tag == 'h1':
            self.flush_buffer()
            self.in_h1 = True
            esc_code0(H1_MARGIN_TOP)

        elif tag == 'h2':
            self.flush_buffer()
            self.in_h2 = True
            esc_code0(H2_MARGIN_TOP)

        elif tag == 'h3':
            self.flush_buffer()
            self.in_h3 = True
            esc_code0(H3_MARGIN_TOP)

        elif tag == 'h4':
            self.flush_buffer()
            self.in_h4 = True
            esc_code0(H4_MARGIN_TOP)

        elif tag == 'h5':
            self.flush_buffer()
            self.in_h5 = True
            esc_code0(H5_MARGIN_TOP)

        elif tag == 'h6':
            self.flush_buffer()
            self.in_h6 = True
            esc_code0(H6_MARGIN_TOP)

        elif tag == 'div':
            self.flush_buffer()
            # suppress thumb info boxes
            if 'class' in attrs:
                c = attrs['class'].lower()
                for ignore in [
                        'thumb',
                        'left',
                        'right',
                        'dablink',
                        'magnify',
                        'navframe',
                        'navtoggle',
                        'navcontent',
                ]:
                    if ignore in c:
                        self.printing = False
                        return
            esc_code0(DIV_MARGIN_TOP)

        elif tag == 'p':
            self.flush_buffer()
            self.in_p = True
            #esc_code0(P_MARGIN_TOP)

        elif tag == 'blockquote' or tag == 'pre':
            self.flush_buffer()
            self.quote += 1
            if self.quote < MAX_QUOTE_LEVEL:
                esc_code0(BLOCKQUOTE_MARGIN_TOP)
                self.indent += BLOCKQUOTE_MARGIN_LEFT
                self.lwidth -= BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                esc_code9(BLOCKQUOTE_MARGIN_LEFT)

        elif tag == 'b':
            self.in_b = True

        elif tag == 'i':
            self.in_i = True

        elif tag == 'big':  # Not sure what to do with this one
            self.in_b = True

        elif tag == 'strong':
            self.in_b = True

        elif tag == 'del':
            self.in_del = True

        elif tag == 'ins':
            self.in_ins = True

        elif tag == 'a' and 'href' in attrs:
            self.in_a = True
            self.url = attrs['href']

        elif tag in ['ul', 'ol', 'dl']:
            if 'start' in attrs:
                list_start = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['start'])
                try:
                    list_start = int(list_start)
                except ValueError:
                    list_start = 1

                self.enter_list(tag, list_start)
            else:
                self.enter_list(tag)

        elif tag == 'li':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(
                        u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                        .format(tag, line, column, article_count + 1,
                                g_this_article_title))
                (t, p) = self.tag_stack.pop()
                return  # just ignore it
                # force ul since this is a li without a parent
                #(t, p) = self.tag_stack.pop()
                #self.tag_stack.append(('ul', p))
                #self.tag_stack.append((t,p))
                #self.enter_list('ul')

            # handle missing </li> at the same level
            # simulate </li> and continue
            if self.li_inside[self.level]:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(
                        u'Warning: missing </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                        .format(tag, line, column, article_count + 1,
                                g_this_article_title))
                (t, p) = self.tag_stack.pop()
                self.flush_buffer(False)
                self.list_decrease_indent()

            self.li_inside[self.level] = True

            if 'value' in attrs:
                list_index = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['value'])
                try:
                    self.li_cnt[self.level] = int(list_index)
                except ValueError:
                    pass
            else:
                self.li_cnt[self.level] += 1

            if self.li_type[self.level] == 'ol':
                self.wordwrap.append(
                    ("{0:d}".format(self.li_cnt[self.level])) + u".",
                    DEFAULT_FONT_IDX, None)
            else:
                if self.level > LIMAX_BULLETS:
                    bullet_num = LIMAX_BULLETS
                else:
                    bullet_num = self.level

                self.wordwrap.append(bullet_c[bullet_num], DEFAULT_FONT_IDX,
                                     None)

            self.flush_buffer()
            self.list_increase_indent()

        elif tag == 'dd':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(
                        u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                        .format(tag, line, column, article_count + 1,
                                g_this_article_title))
                (t, p) = self.tag_stack.pop()
                return  # just ignore it
            esc_code0(LIST_MARGIN_TOP)
            if not self.li_inside[self.level]:
                self.li_cnt[self.level] += 1
                self.li_inside[self.level] = True
                self.list_increase_indent()
            elif warnings:
                (line, column) = self.getpos()
                PrintLog.message(
                    u'Warning: nested <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                    .format(tag, line, column, article_count + 1,
                            g_this_article_title))

        elif tag == 'dt':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(
                        u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                        .format(tag, line, column, article_count + 1,
                                g_this_article_title))
                (t, p) = self.tag_stack.pop()
                return  # just ignore it
            # close unterminated 'dd'
            # i.e. have this  <dt>tag</dt><dd>xxxxx<dt>tag2</dt>.......
            if self.li_inside[self.level]:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(
                        u'Warning: unterminated <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                        .format('dd', line, column, article_count + 1,
                                g_this_article_title))
                (t, p) = self.tag_stack.pop()
                self.handle_endtag('dd')
                self.tag_stack.append((t, p))
            esc_code0(LIST_MARGIN_TOP)

        elif tag == 'br':
            self.flush_buffer()
            esc_code0(BR_MARGIN_TOP)
            self.in_br = True

        elif tag == 'img' and 'src' in attrs:
            # include either image or the 'alt' text
            if self.enable_images:
                (width, height, data) = get_imgdata(attrs['src'], self.indent)
                self.wordwrap.AppendImage(width, height, data, None)
            elif 'alt' in attrs:
                self.handle_data(attrs['alt'])

            self.in_img = True
Example #50
0
    def __del__(self):
        PrintLog.message(u'Flushing databases')
        self.template_db.commit()
        self.template_cursor.close()
        self.template_db.close()

        PrintLog.message(u'Writing: files')
        start_time = time.time()
        i = 0
        with open(self.file_import, 'w') as f:
            for filename in self.file_list:
                f.write('{0:d}\t{1:s}\n'.format(i, filename))
                i += 1
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Writing: articles')
        start_time = time.time()
        with open(self.article_import, 'w') as f:
            for title in self.articles:
                (article_number, fnd_offset, restricted, is_redirect) = self.articles[title]
                f.write('~' + title.encode('utf-8'))    # force string
                f.write('\t{0:d}\t{1:d}\t{2:d}\t{3:d}\n'.format(article_number, fnd_offset, restricted, is_redirect))
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Writing: offsets')
        start_time = time.time()
        with open(self.offset_import, 'w') as f:
            for article_number in self.offsets:
                (file_id, title, seek, length, accumulated) = self.offsets[article_number]
                f.write('{0:d}\t{1:d}\t'.format(article_number, file_id))
                f.write('~' + title.encode('utf-8'))    # force string
                f.write('\t{0:d}\t{1:d}\t{2:d}\n'.format(seek, length, accumulated))
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))


        PrintLog.message(u'Loading: articles')
        start_time = time.time()
        p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.article_db_name, shell=True, stdin=subprocess.PIPE)
        p.stdin.write("""
create table articles (
    title varchar primary key,
    article_number integer,
    fnd_offset integer,
    restricted integer,
    is_redirect integer
);

pragma synchronous = 0;
pragma temp_store = 2;
pragma locking_mode = exclusive;
pragma cache_size = 20000000;
pragma default_cache_size = 20000000;
pragma journal_mode = memory;

.mode tabs
.import {0:s} articles
.exit
""".format(self.article_import))
        p.stdin.close()
        p.wait()
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Loading: offsets and files')
        start_time = time.time()
        p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.offset_db_name, shell=True, stdin=subprocess.PIPE)
        p.stdin.write("""
create table offsets (
    article_number integer primary key,
    file_id integer,
    title varchar,
    seek integer,
    length integer,
    accumulated integer
);

create table files (
    file_id integer primary key,
    filename varchar
);

pragma synchronous = 0;
pragma temp_store = 2;
pragma locking_mode = exclusive;
pragma cache_size = 20000000;
pragma default_cache_size = 20000000;
pragma journal_mode = memory;

.mode tabs
.import {0:s} offsets
.import {1:s} files
.exit
""".format(self.offset_import, self.file_import))
        p.stdin.close()
        p.wait()
        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
Example #51
0
def output_fnd(filename_format, article_index, language_processor, truncate_title):
    """create bigram table"""
    global bigram
    global index_matrix
    global MAXIMUM_TITLE_LENGTH
    global MAXIMUM_TITLE_ACTUAL
    global FND_FILE_SEGMENT_SIZE

    start_time = time.time()
    out_f = SegmentedFileWriter(filename_format, FND_FILE_SEGMENT_SIZE)
    PrintLog.message(u'Writing bigrams: {0:s}'.format(out_f.current_filename))

    sortedgram = [ (value, key) for key, value in bigram.iteritems() ]
    sortedgram.sort()
    sortedgram.reverse()

    bigram = {}
    i = 0
    for k, v in sortedgram:
        out_f.write(v)
        bigram[v] = chr(i + 128)
        i += 1
        if i >= 128:
            break
    while i < 128:
        out_f.write('zz')
        bigram['zz'] = chr(i + 128)
        i += 1

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    # create pfx matrix and write encoded titles

    PrintLog.message(u'Sorting titles')
    start_time = time.time()

    ####@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@####
    unique_articles = {}
    for article in [ (SearchKey.make_key(translated_title[:MAXIMUM_TITLE_LENGTH]), title)
                     for title in article_index.all_indices()
                     for translated_title in language_processor.translate(title) ]:
        unique_articles[article] = 1

    article_list = sorted(unique_articles.keys())

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing matrix: {0:s}'.format(out_f.current_filename))
    start_time = time.time()

    index_matrix = {}
    index_matrix['\0\0\0'] = out_f.tell()

    previous_bigram_title = ''
    previous_utf8_title = ''
    mod_counter = 0

    for stripped_title, title in article_list:

        bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
        (article_number, dummy, restricted, is_redirect) = article_index.get_index(title)

        if '' == bigram_title and is_redirect:
            continue

        utf8_title = title.encode('utf-8')
        if truncate_title:
            utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
        else:
            utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]

        offset = out_f.tell()
        article_index.set_index(title, (article_number, offset, restricted, is_redirect))

        key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
        key2 = key3[0:2] + '\0'
        key1 = key3[0:1] + '\0\0'
        if key1 not in index_matrix:
            index_matrix[key1] = offset
        if key2 not in index_matrix:
            index_matrix[key2] = offset
        if key3 not in index_matrix:
            index_matrix[key3] = offset

        if 0 == mod_counter & 0x0f:
            bigram_common_length = 0
            utf8_common_length = 0
        else:
            bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
            utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
        mod_counter += 1

        previous_bigram_title = bigram_title
        previous_utf8_title = utf8_title

        if bigram_common_length > 1:
            bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
        if utf8_common_length > 1:
            utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]

        out_f.write(struct.pack('<I', article_number) + '\0' + bigram_title + '\0' + utf8_title + '\0')

    PrintLog.message(u'Final segment: {0:s}'.format(out_f.current_filename))
    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
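Both versions of output_fnd also build index_matrix, mapping the lowercased one, two and three character prefixes of each stripped title (NUL-padded to three bytes) to the file offset of the first matching entry, with '\0\0\0' pointing at the start of the title block. A short sketch of the key construction plus a lookup that falls back from the longest to the shortest known prefix (the lookup helper is an assumption, not code from the source):

def prefix_keys(stripped_title):
    """the three NUL-padded keys output_fnd records for one title"""
    key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
    key2 = key3[0:2] + '\0'
    key1 = key3[0:1] + '\0\0'
    return key1, key2, key3

def search_start_offset(index_matrix, search_text):
    """longest matching prefix wins; fall back to shorter keys, then to the table start"""
    key1, key2, key3 = prefix_keys(search_text)
    for key in (key3, key2, key1, '\0\0\0'):
        if key in index_matrix:
            return index_matrix[key]
    return 0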
Example #52
0
def main():
    global verbose
    global sizes
    global distribution
    global dist_list
    global total
    global byte_count

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvd:',
                                   ['help', 'verbose', 'dir='])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    dir = 'image/enpedia'

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--dir'):
            dir = arg
        else:
            usage('unhandled option: ' + opt)

    if not os.path.isdir(dir):
        usage('{0:s} is not a directory'.format(dir))

    idx_file = open(os.path.join(dir, "wiki.idx"), "rb")
    fnd_file = open(os.path.join(dir, "wiki.fnd"), "rb")

    dat_format = os.path.join(dir, "wiki{0:d}.dat")

    index_min = 1
    index_max = struct.unpack('<I', idx_file.read(4))[0]

    PrintLog.message('Total index entries = {0:d}'.format(index_max))

    total = 0
    sizes = {}
    distribution = {}
    byte_count = {}
    dist_list = [
        100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 5000,
        7500, 10000, 20000, 50000, 100000, 200000, 500000, 99999999
    ]
    for d in dist_list:
        distribution[d] = 0
        byte_count[d] = 0

    for item in range(index_max):
        index_number = 1 + item

        if index_number not in sizes:
            process(index_number, idx_file, fnd_file, dat_format)

    PrintLog.message('{0:>10s}   {1:>20s} {2:>20s}  {3:>14s}'.format(
        'Size(<=)', 'Articles', 'Accumulated', 'Bytes'))
    sum = 0
    for i in dist_list:
        sum += distribution[i]
        PrintLog.message(
            '{0:10n} = {1:10n} {2:7.1f} % {3:10n} {4:7.1f} %  {5:14n}'.format(
                i, distribution[i], 100.0 * distribution[i] / index_max, sum,
                100.0 * sum / index_max, byte_count[i]))

    PrintLog.message('summed = {0:10n}'.format(sum))
    PrintLog.message('sizes  = {0:10n}'.format(len(sizes)))
    PrintLog.message('total  = {0:10n}'.format(total))
    idx_file.close()
    fnd_file.close()
Example #53
0
def get_imgdata(imgfile, indent):
    try:
        img = gd.image(imgfile)
    except IOError as e:
        PrintLog.message(u'unable to open image file: {0:s}'.format(imgfile))
        return (0, 0, r'')
Example #54
0
    def handle_starttag(self, tag, attrs):

        global g_starty, g_curr_face, g_halign
        global g_this_article_title, g_links, g_link_cnt
        global warnings

        attrs = dict(attrs)

        # must always do the <html> tag
        if tag == 'html':
            self.local_init()
            self.in_html = True
            self.tag_stack = [(tag, True)]
            return

        self.tag_stack.append((tag, self.printing))

        # we want to skip content that isn't for printing
        if 'class' in attrs:
            if 'noprint' in attrs['class']:
                self.printing = False

            # create a list of language links
            if self.inter_links and tag == 'a' and 'lang-link' in attrs['class']:
                self.language_links.append(attrs['href'])

        # handle the tags
        if not self.printing:
            return

        elif tag == 'script':
            self.printing = False

        elif tag == 'title':
            self.in_title = True
            g_this_article_title = ''

        elif tag == 'body':
            self.in_body = True

        elif tag == 'table':
            self.in_table += 1

        # if in a table suppress everything after this point
        if self.in_table > 0:
            return

        elif tag == 'h1':
            self.flush_buffer()
            self.in_h1 = True
            esc_code0(H1_MARGIN_TOP)

        elif tag == 'h2':
            self.flush_buffer()
            self.in_h2 = True
            esc_code0(H2_MARGIN_TOP)

        elif tag == 'h3':
            self.flush_buffer()
            self.in_h3 = True
            esc_code0(H3_MARGIN_TOP)

        elif tag == 'h4':
            self.flush_buffer()
            self.in_h4 = True
            esc_code0(H4_MARGIN_TOP)

        elif tag == 'h5':
            self.flush_buffer()
            self.in_h5 = True
            esc_code0(H5_MARGIN_TOP)

        elif tag == 'h6':
            self.flush_buffer()
            self.in_h6 = True
            esc_code0(H6_MARGIN_TOP)

        elif tag == 'div':
            self.flush_buffer()
            # suppress thumb info boxes
            if 'class' in attrs:
                c = attrs['class']
                if 'thumb' in c or 'left' in c or 'right' in c \
                    or 'dablink' in c or 'magnify' in c:
                    self.printing = False
                    return
            esc_code0(DIV_MARGIN_TOP)

        elif tag == 'p':
            self.flush_buffer()
            self.in_p = True
            esc_code0(P_MARGIN_TOP)

        elif tag == 'blockquote':
            self.flush_buffer()
            self.quote += 1
            if self.quote < MAX_QUOTE_LEVEL:
                esc_code0(BLOCKQUOTE_MARGIN_TOP)
                self.indent += BLOCKQUOTE_MARGIN_LEFT
                self.lwidth -= BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                esc_code9(BLOCKQUOTE_MARGIN_LEFT)

        elif tag == 'b':
            self.in_b = True

        elif tag == 'i':
            self.in_i = True

        elif tag == 'big':            # Not sure what to do with this one
            self.in_b = True

        elif tag == 'strong':
            self.in_b = True

        elif tag == 'del':
            self.in_del = True

        elif tag == 'ins':
            self.in_ins = True

        elif tag == 'a' and 'href' in attrs:
            self.in_a = True
            self.url  = attrs['href']

        elif tag in ['ul', 'ol', 'dl']:
            if 'start' in attrs:
                list_start = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['start'])
                try:
                    list_start = int(list_start)
                except ValueError:
                    list_start = 1

                self.enter_list(tag, list_start)
            else:
                self.enter_list(tag)

        elif tag == 'li':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
                (t, p) = self.tag_stack.pop()
                return  # just ignore it
                # force ul since this is a li without a parent
                #(t, p) = self.tag_stack.pop()
                #self.tag_stack.append(('ul', p))
                #self.tag_stack.append((t,p))
                #self.enter_list('ul')

            # handle missing </li> at the same level
            # simulate </li> and continue
            if self.li_inside[self.level]:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: missing </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
                (t, p) = self.tag_stack.pop()
                self.flush_buffer(False)
                self.list_decrease_indent()

            self.li_inside[self.level] = True

            if 'value' in attrs:
                list_index = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['value'])
                try:
                    self.li_cnt[self.level] = int(list_index)
                except ValueError:
                    pass
            else:
                self.li_cnt[self.level] += 1

            if self.li_type[self.level] == 'ol':
                self.wordwrap.append(("{0:d}".format(self.li_cnt[self.level])) + u".", DEFAULT_FONT_IDX, None)
            else:
                if self.level > LIMAX_BULLETS:
                    bullet_num = LIMAX_BULLETS
                else:
                    bullet_num = self.level

                self.wordwrap.append(bullet_c[bullet_num], DEFAULT_FONT_IDX, None)

            self.flush_buffer()
            self.list_increase_indent()

        elif tag == 'dd':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
                (t, p) = self.tag_stack.pop()
                return  # just ignore it
            self.li_cnt[self.level] += 1
            self.list_increase_indent()

        elif tag == 'br':
            self.in_br = True

        elif tag == 'img' and 'src' in attrs:
            # include either image or the 'alt' text
            if self.enable_images:
                (width, height, data) = get_imgdata(attrs['src'], self.indent)
                self.wordwrap.AppendImage(width, height, data, None)
            elif 'alt' in attrs:
                self.handle_data(attrs['alt'])

            self.in_img = True
Example #55
0
    def handle_endtag(self, tag):
        global g_this_article_title
        global article_count
        global warnings

        # ignore end tag without start tag
        if (tag, True) not in self.tag_stack and (tag,
                                                  False) not in self.tag_stack:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(
                    u'Warning: superfluous </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                    .format(tag, line, column, article_count + 1,
                            g_this_article_title))
            return

        # backtrack up the stack closing each open tag until there is a match
        (start_tag, self.printing) = self.tag_stack.pop()
        while start_tag != tag:
            self.tag_stack.append((start_tag, self.printing))
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(
                    u'Warning: force </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                    .format(start_tag, line, column, article_count + 1,
                            g_this_article_title))
            self.handle_endtag(start_tag)
            (start_tag, self.printing) = self.tag_stack.pop()

        # must always do </html> tag
        if tag == 'html':
            self.printing = True
            self.tag_stack = []
            self.in_html = False
            esc_code1()
            write_article(self.language_links)
            return

        if not self.printing:
            return

        elif tag == 'script':
            pass

        elif tag == 'title':
            self.in_title = False
            g_this_article_title = g_this_article_title.strip()

        elif tag == 'body':
            self.in_body = False
            self.flush_buffer()

        elif tag == 'table':
            if self.in_table > 0:
                self.in_table -= 1

        # if in a table suppress everything after this point
        if self.in_table > 0:
            return

        elif tag == 'h1':
            self.flush_buffer()
            self.in_h1 = False
            esc_code0(H1_MARGIN_BOTTOM)
            esc_code_separate()  # force the above escape code be output

        elif tag == 'h2':
            self.flush_buffer()
            self.in_h2 = False

        elif tag == 'h3':
            self.flush_buffer()
            self.in_h3 = False

        elif tag == 'h4':
            self.flush_buffer()
            self.in_h4 = False

        elif tag == 'h5':
            self.flush_buffer()
            self.in_h5 = False

        elif tag == 'h6':
            self.flush_buffer()
            self.in_h6 = False

        elif tag == 'div':
            self.flush_buffer()

        elif tag == 'p':
            self.flush_buffer()
            self.in_p = False

        elif tag == 'blockquote' or tag == 'pre':
            self.flush_buffer()
            if self.quote > 0:
                if self.quote < MAX_QUOTE_LEVEL:
                    self.indent -= BLOCKQUOTE_MARGIN_LEFT
                    self.lwidth += BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                    esc_code9(-BLOCKQUOTE_MARGIN_LEFT)
                self.quote -= 1

        elif tag == 'b':
            self.in_b = False

        elif tag == 'big':
            self.in_b = False

        elif tag == 'strong':
            self.in_b = False

        elif tag == 'i':
            self.in_i = False

        elif tag == 'del':
            self.in_del = False

        elif tag == 'ins':
            self.in_ins = False

        elif tag == 'a':
            self.in_a = False
            self.url = ""

        elif tag in ['ul', 'ol', 'dl']:
            self.leave_list()

        elif tag == 'li':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(
                        u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                        .format(tag, line, column, article_count + 1,
                                g_this_article_title))
            else:
                self.flush_buffer(False)
                self.list_decrease_indent()
                self.li_inside[self.level] = False

        elif tag == 'dd':
            if 0 == self.level:
                if warnings or True:
                    (line, column) = self.getpos()
                    PrintLog.message(
                        u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                        .format(tag, line, column, article_count + 1,
                                g_this_article_title))
            else:
                self.flush_buffer()
                esc_code0(LIST_MARGIN_TOP)
                if self.li_inside[self.level]:
                    self.li_inside[self.level] = False
                    self.list_decrease_indent()

        elif tag == 'dt':
            self.flush_buffer()
            esc_code0(LIST_MARGIN_TOP)

        elif tag == 'br':
            self.in_br = False

        elif tag == 'img':
            self.in_img = False
Example #56
0
def main():
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'hv',
                                   ['help',
                                    'verbose',
                                    ])
    except getopt.GetoptError as err:
        usage(err)

    verbose = False
    uint32_size = 4

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        else:
            usage('unhandled option: ' + opt)

    if len(args) < 1:
        usage('missing arguments')

    fnd_file = SegmentedFileReader(args)

    total_entries = 0

    bigram_table = {}
    for i in range(128,256):
        bigram_table[i] = fnd_file.read(2)

    previous_title1 = ''
    previous_title2 = ''

    while True:
        fnd_offset = fnd_file.tell()
        header = fnd_file.read(uint32_size + 1)

        if 0 == len(header):
            break

        article_number, nul_byte = struct.unpack('<IB', header)

        title1 = get_title(fnd_file)
        title2 = get_title(fnd_file)
        total_entries += 1

        length1 = len(title1)
        length2 = len(title2)

        if 0 != length1 and title1[0] < ' ':
            prefix_length = ord(title1[0]) + 1
            title1 = previous_title1[:prefix_length] + title1[1:]

        if 0 != length2 and title2[0] < ' ':
            prefix_length = ord(title2[0]) + 1
            title2 = previous_title2[:prefix_length] + title2[1:]

        full_length1 = len(title1)
        full_length2 = len(title2)

        decoded_title1 = ''
        for c in title1:
            i = ord(c)
            if i in bigram_table:
                decoded_title1 += bigram_table[i]
            else:
                decoded_title1 += c

        PrintLog.message(u'Index: {an:13n} @ Offset: {of:13n} [0x{of:08x}]\n'
                         u'{pad1:s}[{l1:3d}/{fl1:3d}]:{t1!r:s}\n'
                         u'{pad1:s}{pad2}{dt1!r:s}\n'
                         u'{pad1:s}[{l2:3d}/{fl2:3d}]:"{t2:s}"\n'
                         .format(of = fnd_offset, an = article_number,
                                 l1 = length1, fl1 = full_length1, t1 = title1, dt1 = decoded_title1,
                                 pad1 = ' ' * 2, pad2 = ' ' * (2 * 3 + 4),
                                 l2 = length2, fl2 = full_length2, t2 = truncated_utf8(title2)))

        previous_title1 = title1
        previous_title2 = title2

    fnd_file.close()

    PrintLog.message(u'Total entries  = {0:13n}'.format(total_entries))
Example #57
0
 def handle_entityref(self, name):
     """handle &amp; &gt; ..."""
     try:
         self.handle_data(unichr(htmlentitydefs.name2codepoint[name]))
     except KeyError:
         PrintLog.message(u'ENTITYREF ERROR: {0:s} article: {1:s}'.format(name, g_this_article_title))
Example #58
0
def process(index_number, idx_file, fnd_file, dat_format, extract):
    """dump the index and fnd file entries"""

    PrintLog.message('Index number = {0:13n} [0x{0:08x}]'.format(index_number))
    PrintLog.message('')

    uint32_size = 4
    index_entry_size = 2 * uint32_size + 1

    index_offset = uint32_size + index_entry_size * (index_number - 1)
    idx_file.seek(index_offset)
    offset_dat, offset_fnd, file_id = struct.unpack(
        '<2IB', idx_file.read(index_entry_size))

    data_file_name = dat_format.format(file_id)

    PrintLog.message('Index offset = {0:13n} [0x{0:08x}]'.format(index_offset))
    PrintLog.message('Data offset  = {0:13n} [0x{0:08x}]'.format(offset_dat))
    PrintLog.message('FND offset   = {0:13n} [0x{0:08x}]'.format(offset_fnd))
    PrintLog.message('File ID      = {0:13n} [0x{0:08x}] => "{1:s}"'.format(
        file_id, data_file_name))

    fnd_file.seek(offset_fnd)
    article_index_check = struct.unpack('<I', fnd_file.read(uint32_size))[0]

    index_match = '(Matches)' if article_index_check == index_number else '(**MISMATCHED INDEX**)'
    PrintLog.message('FND index    = {0:13n} [0x{0:08x}] {1:s}'.format(
        article_index_check, index_match))

    ignored = fnd_file.read(1)  # skip nul byte
    titles = fnd_file.read(1024).split('\0')  # >= 2 * MAX_TITLE_SEARCH

    PrintLog.message('FND title    = "{0!r:s}"'.format(
        titles[1]))  # actual title

    dat_file = open(data_file_name, 'rb')
    dat_file.seek(offset_dat)

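    # a data record starts with a one-byte page count followed by one <3I>
    # triple per page: article number, offset and uncompressed length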
    number_of_pages = struct.unpack('<B', dat_file.read(1))[0]
    PrintLog.message(
        'Data Pages   = {0:13n} [0x{0:08x}]'.format(number_of_pages))
    PrintLog.message('')

    total_article_bytes = 0
    PrintLog.message('{0:>29s}{1:>25s}{2:>25s}'.format('Article Number',
                                                       'Article Offset',
                                                       'Uncompressed Length'))

    for i in range(0, number_of_pages):
        page_id, page_offset, page_length = struct.unpack(
            '<3I', dat_file.read(3 * uint32_size))
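        # the top bit of the page offset flags a restricted article; mask it off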
        restricted = 'Restricted' if (0 != page_offset & 0x80000000) else ''
        page_offset = page_offset & 0x7fffffff
        total_article_bytes += page_length
        PrintLog.message(
            '{0:3d}:  {1:10n} [0x{1:08x}]  {2:10n} [0x{2:08x}]  {3:10n} [0x{3:08x}]  {4:s}'
            .format(i, page_id, page_offset, page_length, restricted))

    PrintLog.message('{0:<{1}s}{2:10n} [0x{2:08x}]'.format(
        'Total bytes: ', 3 + 3 + 10 + 4 + 8 + 3 + 10 + 4 + 8 + 3,
        total_article_bytes))
    PrintLog.message('')

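    # the page table is followed by a uint32 byte count and that many bytes of
    # compressed article data; DecompressData expands it when extracting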
    data_length = struct.unpack('<I', dat_file.read(4))[0]
    PrintLog.message('DataLength  = {0:13n} [0x{0:08x}]'.format(data_length))

    article_data = dat_file.read(data_length)
    dat_file.close()
    if extract is not None:
        output_file_name = extract + '-I' + str(index_number) + '-b' + str(
            data_length) + '.articles'
        PrintLog.message('Extracting uncompressed articles to: {0:s}'.format(
            output_file_name))
        out = open(output_file_name, 'wb')
        out.write(DecompressData(article_data))
        out.close()

    PrintLog.message('')
Example #59
0
def write_article(language_links):
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))

    elif article_count % 1000 == 0:
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(file_number, now_time - start_time, article_count))
        start_time = now_time

    output.flush()

    # create links
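    # each link record is three little-endian uint32s: the packed (y0, x0) and
    # (y1, x1) corners of the link rectangle plus its link number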
    links_stream = io.BytesIO('')

    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1, link_number(url)))

    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO('')
    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate

    for l in language_links:
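        # records are nul-terminated; when normalisation changes the title,
        # "language#normalised" plus a 0x01 separator and the original link
        # are written instead of the plain "language:title" form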
        language, link = l.split(':', 1)

        if 'ja' == language:
            stripped = japanese_convert(link)
        else:
            stripped = normal_convert(link)

        if link == stripped:
            links_stream.write(l.encode('utf-8') + '\0')
        else:
            links_stream.write((language + '#' + stripped).encode('utf-8') + '\1' + link.encode('utf-8') + '\0')

    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
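    # <I2H>: uint32 offset to the article body (header + links + langs),
    # uint16 link count (g_link_cnt) and a zero uint16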
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.getvalue()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        try:
            (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
            restricted = bool(int(restricted))  # the string '0' is truthy, so use int() to get a real boolean
            article_writer.add_article(article_number, whole_article, fnd_offset, restricted)
        except KeyError:
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title:  {0:s}'.format(g_this_article_title))
            PrintLog.message(u'Offset: {0}'.format(file_offset))
            PrintLog.message(u'Count:  {0:d}'.format(article_count))
    else:
        f_out.write(whole_article)
        f_out.flush()

    # Note: some versions of Python do not move file position on truncate
    #       so an explicit seek is needed to avoid nul padding bytes.
    output.seek(0)
    output.truncate(0)