Example 1
def bigram_encode(title):
    """encode a title in bigram form"""
    global bigram

    result = ''
    title = SearchKey.strip_accents(title)

    while len(title) >= 2:
        if SearchKey.is_valid_character(title[0]):

            b = title[0:2]
            if b in bigram:
                # known pair: append its substitution code
                result += bigram[b]
                title = title[2:]
            else:
                # no code for this pair: copy the first character through
                result += chr(ord(title[0:1]))
                title = title[1:]
        else:
            # skip characters that are not valid in a search key
            #result += '?'
            title = title[1:]

    # a single trailing character cannot start a bigram; copy it through
    if len(title) == 1:
        if SearchKey.is_valid_character(title[0]):
            result += chr(ord(title[0]))
        #else:
        #    result += '?'

    return SearchKey.compact_spaces(result)
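
To see the function end to end, here is a small self-contained driver. The SearchKey helpers and the bigram table below are hypothetical stand-ins for the project's real module, defined in the same module as bigram_encode and kept just large enough to make the call runnable in isolation; the real table is presumably built elsewhere from frequent character pairs.

# Hypothetical stand-ins so the example runs on its own; the real
# SearchKey module and bigram table come from the surrounding project.
import re
import unicodedata


class SearchKey(object):

    @staticmethod
    def strip_accents(text):
        # drop combining marks: u'caf\u00e9' -> 'cafe'
        return ''.join(c for c in unicodedata.normalize('NFD', text)
                       if not unicodedata.combining(c))

    @staticmethod
    def is_valid_character(c):
        # letters, digits and spaces only (an assumption for this sketch)
        return c.isalnum() or c == ' '

    @staticmethod
    def compact_spaces(text):
        return re.sub(r' {2,}', ' ', text).strip()


# toy table: a few frequent pairs mapped to one-character codes
bigram = {
    'th': chr(0x80),
    'he': chr(0x81),
    'in': chr(0x82),
}

encoded = bigram_encode(u'the theory of things')
print(repr(encoded))   # pairs found in the table collapse to single codes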
Example 2
def write_article(language_links):
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))

    elif article_count % 1000 == 0:
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(file_number, now_time - start_time, article_count))
        start_time = now_time

    # create links: each record packs the start and end coordinates
    # as (y << 8) | x plus the target link number, three little-endian uint32s
    links_stream = io.BytesIO()

    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1, link_number(url)))

    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO()
    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate

    for l in language_links:
        language, link = l.split(':', 1)

        language = language.strip()
        link = link.strip()

        # only need the first pronunciation for the link
        # as this must always be present
        if link:
            if 'ja' == language:
                stripped = japanese_convert(link)[0]
            else:
                stripped = normal_convert(link)[0]

            stripped = SearchKey.strip_accents(stripped)

            if link == stripped:
                # already a clean key: store "language:link" as-is
                links_stream.write(l.encode('utf-8') + '\0')
            else:
                # store the stripped key, then the original link after a \1 separator
                links_stream.write((language + '#' + stripped).encode('utf-8') + '\1' + link.encode('utf-8') + '\0')

    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.fetch()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        try:
            (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
            restricted = bool(int(restricted))  # bool('0') would be True, so convert via int first
            article_writer.add_article(article_number, whole_article, fnd_offset, restricted)
        except KeyError:
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title:  {0:s}'.format(g_this_article_title))
            PrintLog.message(u'Count:  {0:d}'.format(article_count))
    else:
        f_out.write(whole_article)
        f_out.flush()
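
The binary layout write_article emits can be checked with the same struct formats it uses. The round-trip below is only an illustration (none of these names exist in the project): it packs and unpacks one 12-byte '<3I' link record with the (y << 8) | x coordinate packing, and the 8-byte '<I2H' header whose first field, 8 + len(links) + len(langs), reads as the offset of the rendered body within the combined article blob.

# Illustration only: round-trip the two struct layouts used above.
import struct

# --- one link record: start point, end point, link number ---
x0, y0, x1, y1, link_no = 10, 300, 250, 310, 42
record = struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1, link_no)
assert len(record) == 12

start, end, number = struct.unpack('<3I', record)
# x lives in the low byte, y in the remaining bits (assuming x < 256)
assert (start & 0xFF, start >> 8) == (x0, y0)
assert (end & 0xFF, end >> 8) == (x1, y1)
assert number == link_no

# --- the 8-byte article header ---
links = record                     # pretend the article has one link
langs = b'en:Example\x00'          # one NUL-terminated language link
header = struct.pack('<I2H', 8 + len(links) + len(langs), 1, 0)  # original passes g_link_cnt and 0
assert len(header) == 8

body = b'(rendered article body)'
whole_article = header + links + langs + body

body_offset, link_count, reserved = struct.unpack('<I2H', header)
assert whole_article[body_offset:] == body   # first header field points at the body
print(body_offset, link_count, reserved)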