コード例 #1
0
ファイル: ArticleIndex.py プロジェクト: nzmichaelh/wrdk
def bigram_encode(title):
    """Return the bigram-compressed form of *title*.

    Accents are removed first with ``SearchKey.strip_accents``.  The text is
    then scanned left to right:

    * a character rejected by ``SearchKey.is_valid_character`` is dropped;
    * a two-character pair present in the module-level ``bigram`` table is
      replaced by its table entry and both characters are consumed;
    * otherwise the single character is emitted via ``chr(ord(...))``.

    The assembled string is passed through ``SearchKey.compact_spaces``
    before being returned.
    """
    global bigram

    text = SearchKey.strip_accents(title)
    length = len(text)
    pieces = []
    pos = 0

    # Pair-wise scan: runs while at least two characters remain.
    while length - pos >= 2:
        current = text[pos]
        if not SearchKey.is_valid_character(current):
            # invalid characters are skipped without emitting anything
            pos += 1
            continue

        pair = text[pos:pos + 2]
        if pair in bigram:
            pieces.append(bigram[pair])
            pos += 2
        else:
            pieces.append(chr(ord(current)))
            pos += 1

    # A single left-over character can no longer form a pair.
    if length - pos == 1:
        last = text[pos]
        if SearchKey.is_valid_character(last):
            pieces.append(chr(ord(last)))

    return SearchKey.compact_spaces(''.join(pieces))
コード例 #2
0
def bigram_encode(title):
    """Encode *title* using the module-level ``bigram`` lookup table.

    The title has its accents stripped (``SearchKey.strip_accents``) and is
    then consumed from the front.  When the leading character is valid
    according to ``SearchKey.is_valid_character``, either the leading pair is
    swapped for its ``bigram`` entry (consuming two characters) or the
    leading character alone is emitted (consuming one).  Invalid leading
    characters are discarded.  The output is finally normalised with
    ``SearchKey.compact_spaces``.
    """
    global bigram

    remaining = SearchKey.strip_accents(title)
    encoded = ''

    while len(remaining) >= 2:
        head = remaining[0]
        step = 1  # characters consumed this round
        if SearchKey.is_valid_character(head):
            pair = remaining[:2]
            if pair in bigram:
                token = bigram[pair]
                step = 2
            else:
                token = chr(ord(head))
            encoded += token
        # an invalid head falls through: nothing emitted, one char dropped
        remaining = remaining[step:]

    # at most one character is left; emit it only if it is valid
    if len(remaining) == 1 and SearchKey.is_valid_character(remaining[0]):
        encoded += chr(ord(remaining[0]))

    return SearchKey.compact_spaces(encoded)
コード例 #3
0
ファイル: ArticleIndex.py プロジェクト: yuben75/wikireader
    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        """Record a redirect from ``title`` to ``rtitle``.

        Behaviour depends on ``key``/``rkey``:

        * ``key == self.KEY_TEMPLATE``: when the two titles differ, a row is
          written to the ``redirects`` table through ``self.template_cursor``
          using category-prefixed titles; ``self.template_redirect_count`` is
          incremented either way.
        * either key is not ``self.KEY_ARTICLE``: the redirect is ignored
          (logged only when the global ``verbose`` is set).
        * article to article: an empty target is logged; otherwise the pair
          is stored in ``self.redirects``, ``self.redirect_count`` is
          incremented and ``generate_bigram`` is fed every translation of
          ``title`` produced by ``self.language_processor``.

        Parameters:
            category, rcategory: UTF-8 encoded byte strings (they are decoded
                with ``unicode(..., 'utf-8')`` wherever they are used).
            key, rkey: integer keys compared with ``self.KEY_TEMPLATE`` and
                ``self.KEY_ARTICLE``.
            title, rtitle: source and target titles; both are passed through
                ``self.convert`` first.
            seek: not used by this method.
        """
        global verbose

        # U+200E / U+200F are the left-to-right / right-to-left marks
        title = self.convert(title).strip(u'\u200e\u200f')

        rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')

        # redirected title may contain '%xx' items - treat as unicode sequence
        # if it fails just keep the %xx sequences intact since it must represent
        # either real %xx or some unknowable coding scheme
        try:
            rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')),
                             'utf-8').strip().strip(u'\u200e\u200f')
        except UnicodeDecodeError:
            pass

        rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()

        if self.KEY_TEMPLATE == key:
            if title != rtitle:
                title = unicode(
                    category,
                    'utf-8').capitalize() + ':' + upper_case_first_char(title)
                rtitle = unicode(
                    rcategory,
                    'utf-8').capitalize() + ':' + upper_case_first_char(rtitle)
                self.template_cursor.execute(
                    u'insert or replace into redirects (title, redirect) values(?, ?)',
                    [
                        u'~{0:d}~{1:s}'.format(self.file_id(), title),
                        u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)
                    ])

            # counted even when title and target are identical
            self.template_redirect_count += 1
            return

        if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
            if verbose:
                PrintLog.message(
                    u'Non-article Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'
                    .format(unicode(category, 'utf-8'), key, title,
                            unicode(rcategory, 'utf-8'), rkey, rtitle))
            return

        if '' == rtitle:
            # category is a byte string: decode it explicitly, as the other
            # messages do, so a non-ASCII name cannot trigger an implicit
            # ASCII decode inside unicode.format()
            PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(
                unicode(category, 'utf-8'), key, title))
        else:
            self.redirects[title] = rtitle
            self.redirect_count += 1

            for t in self.language_processor.translate(title):
                generate_bigram(t)

            if verbose:
                PrintLog.message(
                    u'Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'.
                    format(unicode(category, 'utf-8'), key, title,
                           unicode(rcategory, 'utf-8'), rkey, rtitle))
コード例 #4
0
ファイル: ArticleIndex.py プロジェクト: 9072997/wikireader
    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        """Register the redirect ``title`` -> ``rtitle``.

        Both titles go through ``self.convert`` and have directional marks
        (U+200E, U+200F) stripped; the target is additionally URL-unquoted
        when the result is valid UTF-8, space-compacted, and stripped of
        leading colons.

        Handling by key:

        * template (``key == self.KEY_TEMPLATE``): if the titles differ, an
          ``insert or replace`` into the ``redirects`` table is executed on
          ``self.template_cursor``; ``self.template_redirect_count`` is
          incremented in either case.
        * anything where ``key`` or ``rkey`` is not ``self.KEY_ARTICLE``:
          skipped, with a log line when the global ``verbose`` is set.
        * article: an empty target is only logged; otherwise the mapping is
          stored in ``self.redirects``, ``self.redirect_count`` is bumped and
          ``generate_bigram`` is called for each translation of ``title``.

        ``category`` and ``rcategory`` are UTF-8 byte strings (decoded with
        ``unicode(..., 'utf-8')`` at every use); ``key`` and ``rkey`` are
        integers; ``seek`` is not used by this method.
        """
        global verbose

        title = self.convert(title).strip(u'\u200e\u200f')

        rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')

        # redirected title may contain '%xx' items - treat as unicode sequence
        # if it fails just keep the %xx sequences intact since it must represent
        # either real %xx or some unknowable coding scheme
        try:
            rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')), 'utf-8').strip().strip(u'\u200e\u200f')
        except UnicodeDecodeError:
            pass

        rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()

        if self.KEY_TEMPLATE == key:
            if title != rtitle:
                title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
                rtitle = unicode(rcategory, 'utf-8').capitalize() + ':' + upper_case_first_char(rtitle)
                self.template_cursor.execute(u'insert or replace into redirects (title, redirect) values(?, ?)',
                                             [u'~{0:d}~{1:s}'.format(self.file_id(), title),
                                              u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])

            # the counter advances even for a self-redirect
            self.template_redirect_count += 1
            return

        if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
            if verbose:
                PrintLog.message(u'Non-article Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'
                                 .format(unicode(category, 'utf-8'), key, title,
                                         unicode(rcategory, 'utf-8'), rkey, rtitle))
            return

        if '' == rtitle:
            # decode the byte-string category explicitly (matching the
            # non-article message) rather than relying on the implicit ASCII
            # decode performed by unicode.format()
            PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'
                             .format(unicode(category, 'utf-8'), key, title))
        else:
            self.redirects[title] = rtitle
            self.redirect_count += 1

            for t in self.language_processor.translate(title):
                generate_bigram(t)

            if verbose:
                PrintLog.message(u'Redirect: {0:s}[{1:d}]:{2:s} ->  {3:s}[{4:d}]:{5:s}'
                                 .format(unicode(category, 'utf-8'), key, title,
                                         unicode(rcategory, 'utf-8'), rkey, rtitle))