def bigram_encode(title):
    """Return the bigram-encoded form of a title.

    The title is first passed through ``SearchKey.strip_accents`` and then
    scanned left to right:

    * a valid character that starts a two-character pair found in the
      global ``bigram`` table is replaced by that table entry and both
      characters are consumed;
    * a valid character with no matching pair is emitted on its own;
    * a character rejected by ``SearchKey.is_valid_character`` is dropped.

    The accumulated text is returned via ``SearchKey.compact_spaces``.
    """
    global bigram

    text = SearchKey.strip_accents(title)
    length = len(text)
    encoded = ''
    pos = 0

    # Walk the text while at least two characters remain, so that a pair
    # lookup is always possible at the current position.
    while length - pos >= 2:
        head = text[pos]
        if not SearchKey.is_valid_character(head):
            # invalid characters are skipped without producing output
            pos += 1
            continue

        pair = text[pos:pos + 2]
        if pair in bigram:
            encoded += bigram[pair]
            pos += 2
        else:
            encoded += chr(ord(head))
            pos += 1

    # At most one character is left over once the pair scan has finished.
    if pos < length:
        tail = text[pos]
        if SearchKey.is_valid_character(tail):
            encoded += chr(ord(tail))

    return SearchKey.compact_spaces(encoded)
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
    """Record a redirect from ``title`` to ``rtitle``.

    Template redirects (``key == self.KEY_TEMPLATE``) are written to the
    ``redirects`` table through ``self.template_cursor``.  Redirects whose
    source and target are both articles are stored in ``self.redirects``
    and the translated source title is passed to ``generate_bigram``.
    Every other combination is ignored, apart from a log line when the
    global ``verbose`` flag is set.

    Args:
        category: UTF-8 encoded byte string naming the source category.
        key: integer key of the source; compared with
            ``self.KEY_TEMPLATE`` and ``self.KEY_ARTICLE``.
        title: source title, normalised with ``self.convert``.
        rcategory: UTF-8 encoded byte string naming the target category.
        rkey: integer key of the target.
        rtitle: target title; may contain ``%xx`` escapes.
        seek: not used by this method.
    """
    global verbose

    # U+200E / U+200F are the left-to-right and right-to-left marks
    title = self.convert(title).strip(u'\u200e\u200f')
    rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')

    # redirected title may contain '%xx' items - treat as unicode sequence
    # if it fails just keep the %xx sequences intact since it must represent
    # either real %xx or some unknowable coding scheme
    try:
        rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')),
                         'utf-8').strip().strip(u'\u200e\u200f')
    except UnicodeDecodeError:
        pass

    rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()

    if self.KEY_TEMPLATE == key:
        if title != rtitle:
            title = (unicode(category, 'utf-8').capitalize() + ':'
                     + upper_case_first_char(title))
            rtitle = (unicode(rcategory, 'utf-8').capitalize() + ':'
                      + upper_case_first_char(rtitle))
            self.template_cursor.execute(
                u'insert or replace into redirects (title, redirect) values(?, ?)',
                [u'~{0:d}~{1:s}'.format(self.file_id(), title),
                 u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])
        # NOTE(review): the counter is bumped for every template redirect,
        # including self-redirects that are not stored - confirm this is
        # the intended nesting.
        self.template_redirect_count += 1
        return

    if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
        if verbose:
            PrintLog.message(
                u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                .format(unicode(category, 'utf-8'), key, title,
                        unicode(rcategory, 'utf-8'), rkey, rtitle))
        return

    if '' == rtitle:
        # The category is decoded explicitly: formatting a non-ASCII byte
        # string into a unicode template would otherwise go through an
        # implicit ASCII decode and raise UnicodeDecodeError.
        PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(
            unicode(category, 'utf-8'), key, title))
        return

    self.redirects[title] = rtitle
    self.redirect_count += 1

    for t in self.language_processor.translate(title):
        generate_bigram(t)

    if verbose:
        # decoded for the same reason as the empty-redirect message above
        PrintLog.message(
            u'Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
            .format(unicode(category, 'utf-8'), key, title,
                    unicode(rcategory, 'utf-8'), rkey, rtitle))
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
    """Record a redirect from ``title`` to ``rtitle``.

    Template redirects (``key == self.KEY_TEMPLATE``) are written to the
    ``redirects`` table through ``self.template_cursor``.  Redirects whose
    source and target are both articles are stored in ``self.redirects``
    and the translated source title is passed to ``generate_bigram``.
    Every other combination is ignored, apart from a log line when the
    global ``verbose`` flag is set.

    Args:
        category: UTF-8 encoded byte string naming the source category.
        key: integer key of the source; compared with
            ``self.KEY_TEMPLATE`` and ``self.KEY_ARTICLE``.
        title: source title, normalised with ``self.convert``.
        rcategory: UTF-8 encoded byte string naming the target category.
        rkey: integer key of the target.
        rtitle: target title; may contain ``%xx`` escapes.
        seek: not used by this method.
    """
    global verbose

    # U+200E / U+200F are the left-to-right and right-to-left marks
    title = self.convert(title).strip(u'\u200e\u200f')
    rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')

    # redirected title may contain '%xx' items - treat as unicode sequence
    # if it fails just keep the %xx sequences intact since it must represent
    # either real %xx or some unknowable coding scheme
    try:
        rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')),
                         'utf-8').strip().strip(u'\u200e\u200f')
    except UnicodeDecodeError:
        pass

    rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()

    if self.KEY_TEMPLATE == key:
        if title != rtitle:
            title = (unicode(category, 'utf-8').capitalize() + ':'
                     + upper_case_first_char(title))
            rtitle = (unicode(rcategory, 'utf-8').capitalize() + ':'
                      + upper_case_first_char(rtitle))
            self.template_cursor.execute(
                u'insert or replace into redirects (title, redirect) values(?, ?)',
                [u'~{0:d}~{1:s}'.format(self.file_id(), title),
                 u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])
        # NOTE(review): the counter is bumped for every template redirect,
        # including self-redirects that are not stored - confirm this is
        # the intended nesting.
        self.template_redirect_count += 1
        return

    if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
        if verbose:
            PrintLog.message(
                u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                .format(unicode(category, 'utf-8'), key, title,
                        unicode(rcategory, 'utf-8'), rkey, rtitle))
        return

    if '' == rtitle:
        # The category is decoded explicitly: formatting a non-ASCII byte
        # string into a unicode template would otherwise go through an
        # implicit ASCII decode and raise UnicodeDecodeError.
        PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(
            unicode(category, 'utf-8'), key, title))
        return

    self.redirects[title] = rtitle
    self.redirect_count += 1

    for t in self.language_processor.translate(title):
        generate_bigram(t)

    if verbose:
        # decoded for the same reason as the empty-redirect message above
        PrintLog.message(
            u'Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
            .format(unicode(category, 'utf-8'), key, title,
                    unicode(rcategory, 'utf-8'), rkey, rtitle))