def bigram_encode(title):
    """encode a title in bigram form

    known character pairs are replaced by their one byte bigram code
    (values >= 128); other valid characters pass through as single
    bytes; invalid characters are dropped entirely
    """
    global bigram
    pieces = []
    remaining = SearchKey.strip_accents(title)
    while len(remaining) >= 2:
        if not SearchKey.is_valid_character(remaining[0]):
            # unsupported character: skip it
            remaining = remaining[1:]
            continue
        pair = remaining[0:2]
        if pair in bigram:
            # pair has a bigram code: consume both characters
            pieces.append(bigram[pair])
            remaining = remaining[2:]
        else:
            # no bigram code: emit the single character as a byte
            pieces.append(chr(ord(remaining[0:1])))
            remaining = remaining[1:]
    # a single trailing character can never form a pair
    if len(remaining) == 1 and SearchKey.is_valid_character(remaining[0]):
        pieces.append(chr(ord(remaining[0])))
    return SearchKey.compact_spaces(''.join(pieces))
def translate(self, text):
    """take Japanese string and convert to Roman letters"""
    result = []
    for variant in super(type(self), self).translate(text):
        # map unsupported punctuation to spaces, then split into words
        cleaned = ''.join(' ' if c in SearchKey.unsupported_punctuation() else c
                          for c in variant)
        for word in cleaned.split():
            if type(word) == unicode:
                word = word.encode('utf-8')
            phonetics = self.get_phonetics(word)
            # *** nasty hack: cap both arrays at 100 elements so the number
            # *** of combined translations stays bounded - some Japanese
            # *** phrases have hundreds of millions of possible
            # *** pronunciations, e.g. 平親清女・平親清女妹・平親清四女・平親清五女
            # *** 120 * 360 * 120 * 120 -> 622,080,000
            result = super(type(self), self).append_translations(
                result[:100], phonetics[:100], ' ')
    # guarantee at least one (possibly empty) translation
    if result is None or [] == result or '' == result:
        return ['']
    return result
def generate_bigram(text):
    """create bigram from pairs of characters

    counts at most the first two character pairs of the text in the
    global bigram frequency table
    """
    global bigram
    # pair at offset 0 requires len > 2, pair at offset 2 requires len > 4
    for offset, minimum_length in ((0, 2), (2, 4)):
        if len(text) > minimum_length:
            pair = text[offset:offset + 2]
            try:
                if SearchKey.is_valid_character(pair[0]) and SearchKey.is_valid_character(pair[1]):
                    bigram[pair] += 1
            except KeyError:
                # first occurrence of this pair
                bigram[pair] = 1
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
    """record a redirect from one page title to another

    template redirects are written to the template database; article
    redirects are collected in self.redirects and also feed the bigram
    statistics

    NOTE(review): the 'Empty Redirect'/'Redirect' messages below format
    the raw byte-string category into a unicode template, unlike the
    'Non-article' branch which decodes it first; non-ASCII categories
    would raise UnicodeDecodeError here - confirm categories are ASCII
    """
    global verbose
    # strip directional (LRM/RLM) marks from both titles
    title = self.convert(title).strip(u'\u200e\u200f')
    rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')

    # redirected title may contain '%xx' items - treat as unicode sequence
    # if it fails just keep the %xx sequences intact since it must represent
    # either real %xx or some unknowable coding scheme
    try:
        rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')),
                         'utf-8').strip().strip(u'\u200e\u200f')
    except UnicodeDecodeError:
        pass

    rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()

    if self.KEY_TEMPLATE == key:
        # template redirect: store in the per-file template database
        if title != rtitle:
            title = unicode(
                category,
                'utf-8').capitalize() + ':' + upper_case_first_char(title)
            rtitle = unicode(
                rcategory,
                'utf-8').capitalize() + ':' + upper_case_first_char(rtitle)
            self.template_cursor.execute(
                u'insert or replace into redirects (title, redirect) values(?, ?)',
                [
                    u'~{0:d}~{1:s}'.format(self.file_id(), title),
                    u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)
                ])
            self.template_redirect_count += 1
        return

    # only article -> article redirects are recorded
    if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
        if verbose:
            PrintLog.message(
                u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                .format(unicode(category, 'utf-8'), key, title,
                        unicode(rcategory, 'utf-8'), rkey, rtitle))
        return

    if '' == rtitle:
        PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(
            category, key, title))
    else:
        self.redirects[title] = rtitle
        self.redirect_count += 1
        # every pronunciation of the title contributes to the bigram counts
        for t in self.language_processor.translate(title):
            generate_bigram(t)
        if verbose:
            PrintLog.message(
                u'Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'.
                format(category, key, title, rcategory, rkey, rtitle))
def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
    """record a redirect from one page title to another

    category/key/title    - source namespace string, kind code and title
    rcategory/rkey/rtitle - destination namespace string, kind code and title
    seek                  - unused here

    template redirects are written to the template database; article
    redirects are collected in self.redirects and also feed the bigram
    statistics
    """
    global verbose
    # strip directional (LRM/RLM) marks from both titles
    title = self.convert(title).strip(u'\u200e\u200f')
    rtitle = self.convert(rtitle).strip().strip(u'\u200e\u200f')

    # redirected title may contain '%xx' items - treat as unicode sequence
    # if it fails just keep the %xx sequences intact since it must represent
    # either real %xx or some unknowable coding scheme
    try:
        rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')),
                         'utf-8').strip().strip(u'\u200e\u200f')
    except UnicodeDecodeError:
        pass

    rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()

    if self.KEY_TEMPLATE == key:
        # template redirect: store in the per-file template database
        if title != rtitle:
            title = unicode(category, 'utf-8').capitalize() \
                + ':' + upper_case_first_char(title)
            rtitle = unicode(rcategory, 'utf-8').capitalize() \
                + ':' + upper_case_first_char(rtitle)
            self.template_cursor.execute(
                u'insert or replace into redirects (title, redirect) values(?, ?)',
                [u'~{0:d}~{1:s}'.format(self.file_id(), title),
                 u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])
            self.template_redirect_count += 1
        return

    # only article -> article redirects are recorded
    if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
        if verbose:
            PrintLog.message(u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                             .format(unicode(category, 'utf-8'), key, title,
                                     unicode(rcategory, 'utf-8'), rkey, rtitle))
        return

    if '' == rtitle:
        # decode category before formatting into the unicode template,
        # consistent with the 'Non-article' message above; formatting the
        # raw byte string would raise UnicodeDecodeError for non-ASCII
        # categories
        PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'
                         .format(unicode(category, 'utf-8'), key, title))
    else:
        self.redirects[title] = rtitle
        self.redirect_count += 1
        # every pronunciation of the title contributes to the bigram counts
        for t in self.language_processor.translate(title):
            generate_bigram(t)
        if verbose:
            PrintLog.message(u'Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
                             .format(unicode(category, 'utf-8'), key, title,
                                     unicode(rcategory, 'utf-8'), rkey, rtitle))
def output_pfx(filename):
    """output the pfx matrix

    writes one little-endian 32 bit offset for every possible
    three-character key combination (in nested key order); keys with no
    entry in the global index_matrix are written as offset 0
    """
    global index_matrix
    PrintLog.message(u'Writing: {0:s}'.format(filename))
    start_time = time.time()
    # '\0' is the pad/"no character" slot; renamed from 'list' so the
    # builtin is not shadowed
    characters = '\0' + SearchKey.all_characters()
    out_f = open(filename, 'wb')
    try:
        for k1 in characters:
            for k2 in characters:
                for k3 in characters:
                    # default 0 marks an absent prefix
                    offset = index_matrix.get(k1 + k2 + k3, 0)
                    out_f.write(struct.pack('<I', offset))
    finally:
        # close even if a write fails part-way through
        out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
def translate(self, text):
    """take Japanese string and convert to Roman letters"""
    result = []
    # the base class may yield several variants of the input text
    for text in super(type(self), self).translate(text):
        # replace unsupported punctuation with spaces, then split into words
        split_text = ''.join([
            c if not c in SearchKey.unsupported_punctuation() else ' '
            for c in list(text)]).split()
        for tt in split_text:
            # get_phonetics expects a UTF-8 byte string
            if type(tt) == unicode:
                tt = tt.encode('utf-8')
            phonetics = self.get_phonetics(tt)
            #result = super(type(self), self).append_translations(result, phonetics, ' ')
            # *** nasty hack to make sure the number of translations does not exceed 10000
            # *** as some Japanese phrases can have hundreds of millions of possible pronunciations
            # *** e.g. 平親清女・平親清女妹・平親清四女・平親清五女
            # *** 120 * 360 * 120 * 120 -> 622,080,000
            # *** just cut the arrays to the first 100 elements
            result = super(type(self), self).append_translations(result[:100], phonetics[:100], ' ')
    # guarantee at least one (possibly empty) translation
    if result is None or [] == result or '' == result:
        return ['']
    return result
def output_fnd(filename, article_index, language_processor, truncate_title):
    """create bigram table

    writes the .fnd file: first the 128 two-byte bigram pairs, then one
    record per title (article number, bigram-encoded title, utf-8 title)
    with common prefixes compressed; also rebuilds the global bigram
    encoding map and the global index_matrix of 3-character prefixes
    """
    global bigram
    global index_matrix
    global MAXIMUM_TITLE_LENGTH
    global MAXIMUM_TITLE_ACTUAL

    PrintLog.message(u'Writing bigrams: {0:s}'.format(filename))
    start_time = time.time()
    out_f = open(filename, 'wb')

    # sort bigram pairs by descending frequency
    sortedgram = [ (value, key) for key, value in bigram.iteritems() ]
    sortedgram.sort()
    sortedgram.reverse()

    # re-use the global as the pair -> code-byte encoding map;
    # codes are chr(128)..chr(255) for the 128 most frequent pairs
    bigram = {}
    i = 0
    for k, v in sortedgram:
        out_f.write(v)
        bigram[v] = chr(i + 128)
        i += 1
        if i >= 128:
            break
    # pad the table to exactly 128 entries with a dummy pair
    while i < 128:
        out_f.write('zz')
        bigram['zz'] = chr(i + 128)
        i += 1
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    # create pfx matrix and write encoded titles
    #article_list = [strip_accents(k) for k in article_index.keys()]
    #article_list.sort(key = lambda x: strip_accents(x).lower())
    PrintLog.message(u'Sorting titles')
    start_time = time.time()
    # sort by search key so the prefix compression below is effective
    article_list = [ (SearchKey.make_key(language_processor.translate(title)), title)
                     for title in article_index.all_indices() ]
    article_list.sort()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing matrix: {0:s}'.format(filename))
    start_time = time.time()
    index_matrix = {}
    # the all-pad key points at the start of the title records
    index_matrix['\0\0\0'] = out_f.tell()
    previous_bigram_title = ''
    previous_utf8_title = ''
    mod_counter = 0
    for stripped_title, title in article_list:
        bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
        (article_number, dummy, restricted, is_redirect) = article_index.get_index(title)
        # skip redirects whose title encodes to nothing
        if '' == bigram_title and is_redirect:
            continue
        utf8_title = title.encode('utf-8')
        if truncate_title:
            utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
        else:
            utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]
        # remember this record's file offset in the index
        offset = out_f.tell()
        article_index.set_index(title, (article_number, offset, restricted, is_redirect))
        # 1-, 2- and 3-character prefix keys, '\0'-padded to 3 bytes;
        # only the first occurrence of each prefix is recorded
        key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
        key2 = key3[0:2] + '\0'
        key1 = key3[0:1] + '\0\0'
        if key1 not in index_matrix:
            index_matrix[key1] = offset
        if key2 not in index_matrix:
            index_matrix[key2] = offset
        if key3 not in index_matrix:
            index_matrix[key3] = offset
        # every 16th record is a full (uncompressed) restart point
        if 0 == mod_counter & 0x0f:
            bigram_common_length = 0
            utf8_common_length = 0
        else:
            bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
            utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
        mod_counter += 1
        previous_bigram_title = bigram_title
        previous_utf8_title = utf8_title
        # replace a shared prefix (2+ bytes) with a one byte length code
        if bigram_common_length > 1:
            bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
        if utf8_common_length > 1:
            utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]
        out_f.write(struct.pack('<I', article_number) + '\0' + bigram_title + '\0' + utf8_title + '\0')
    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
def write_article(language_links):
    """assemble and emit the rendered article

    language_links - sequence of 'language:title' inter-wiki link strings

    builds header + in-page links + language links + rendered body and
    either hands the result to the compressed article writer or appends
    it to the plain output file
    """
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))
    elif article_count % 1000 == 0:
        # periodic progress report
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(file_number, now_time - start_time, article_count))
        start_time = now_time

    # create links: one packed (coordinates, article number) record each
    links_stream = io.BytesIO('')
    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1, link_number(url)))
    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO('')
    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate
    for l in language_links:
        language, link = l.split(':', 1)
        language = language.strip()
        link = link.strip()
        # only need the first pronunciation for the link
        # as this must always be present
        if link is not None and '' != link:
            if 'ja' == language:
                stripped = japanese_convert(link)[0]
            else:
                stripped = normal_convert(link)[0]
            stripped = SearchKey.strip_accents(stripped)
            if link == stripped:
                links_stream.write(l.encode('utf-8') + '\0')
            else:
                # display form differs from search form: store both
                links_stream.write((language + '#' + stripped).encode('utf-8') + '\1' + link.encode('utf-8') + '\0')
    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.fetch()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        try:
            (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
            restricted = bool(int(restricted))  # '0' is True so turn it into False
            article_writer.add_article(article_number, whole_article, fnd_offset, restricted)
        except KeyError:
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title: {0:s}'.format(g_this_article_title))
            # was {0:s}: formatting an int with 's' raises ValueError,
            # masking the real error - use {0:d} for the counter
            PrintLog.message(u'Count: {0:d}'.format(article_count))
    else:
        f_out.write(whole_article)
        f_out.flush()
def write_article(language_links):
    """assemble and emit the rendered article

    language_links - sequence of 'language:title' inter-wiki link strings

    builds header + in-page links + language links + rendered body and
    either hands the result to the compressed article writer or appends
    it to the plain output file
    """
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))
    elif article_count % 1000 == 0:
        # periodic progress report
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(
            file_number, now_time - start_time, article_count))
        start_time = now_time

    # create links: one packed (coordinates, article number) record each
    links_stream = io.BytesIO('')
    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(
            struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1,
                        link_number(url)))
    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO('')
    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate
    for l in language_links:
        language, link = l.split(':', 1)
        language = language.strip()
        link = link.strip()
        # only need the first pronunciation for the link
        # as this must always be present
        if link is not None and '' != link:
            if 'ja' == language:
                stripped = japanese_convert(link)[0]
            else:
                stripped = normal_convert(link)[0]
            stripped = SearchKey.strip_accents(stripped)
            if link == stripped:
                links_stream.write(l.encode('utf-8') + '\0')
            else:
                # display form differs from search form: store both
                links_stream.write((language + '#' + stripped).encode('utf-8') +
                                   '\1' + link.encode('utf-8') + '\0')
    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.fetch()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        try:
            (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
            restricted = bool(
                int(restricted))  # '0' is True so turn it into False
            article_writer.add_article(article_number, whole_article,
                                       fnd_offset, restricted)
        except KeyError:
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title: {0:s}'.format(g_this_article_title))
            # was {0:s}: formatting an int with 's' raises ValueError,
            # masking the real error - use {0:d} for the counter
            PrintLog.message(u'Count: {0:d}'.format(article_count))
    else:
        f_out.write(whole_article)
        f_out.flush()
def output_fnd(filename_format, article_index, language_processor, truncate_title):
    """create bigram table

    segmented-file variant: writes the bigram table and the per-title
    records through a SegmentedFileWriter so the output is split into
    FND_FILE_SEGMENT_SIZE chunks; also rebuilds the global bigram
    encoding map and the global index_matrix of 3-character prefixes
    """
    global bigram
    global index_matrix
    global MAXIMUM_TITLE_LENGTH
    global MAXIMUM_TITLE_ACTUAL
    global FND_FILE_SEGMENT_SIZE

    start_time = time.time()
    out_f = SegmentedFileWriter(filename_format, FND_FILE_SEGMENT_SIZE)
    PrintLog.message(u'Writing bigrams: {0:s}'.format(out_f.current_filename))

    # sort bigram pairs by descending frequency
    sortedgram = [ (value, key) for key, value in bigram.iteritems() ]
    sortedgram.sort()
    sortedgram.reverse()

    # re-use the global as the pair -> code-byte encoding map;
    # codes are chr(128)..chr(255) for the 128 most frequent pairs
    bigram = {}
    i = 0
    for k, v in sortedgram:
        out_f.write(v)
        bigram[v] = chr(i + 128)
        i += 1
        if i >= 128:
            break
    # pad the table to exactly 128 entries with a dummy pair
    while i < 128:
        out_f.write('zz')
        bigram['zz'] = chr(i + 128)
        i += 1
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    # create pfx matrix and write encoded titles
    PrintLog.message(u'Sorting titles')
    start_time = time.time()

    ####@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@####
    # one (search key, title) entry per pronunciation, de-duplicated
    # via dict keys, then sorted so prefix compression is effective
    unique_articles = {}
    for article in [ (SearchKey.make_key(translated_title[:MAXIMUM_TITLE_LENGTH]), title)
                     for title in article_index.all_indices()
                     for translated_title in language_processor.translate(title) ]:
        unique_articles[article] = 1
    article_list = sorted(unique_articles.keys())

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

    PrintLog.message(u'Writing matrix: {0:s}'.format(out_f.current_filename))
    start_time = time.time()
    index_matrix = {}
    # the all-pad key points at the start of the title records
    index_matrix['\0\0\0'] = out_f.tell()
    previous_bigram_title = ''
    previous_utf8_title = ''
    mod_counter = 0
    for stripped_title, title in article_list:
        bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
        (article_number, dummy, restricted, is_redirect) = article_index.get_index(title)
        # skip redirects whose title encodes to nothing
        if '' == bigram_title and is_redirect:
            continue
        utf8_title = title.encode('utf-8')
        if truncate_title:
            utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
        else:
            utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]
        # remember this record's (logical) file offset in the index
        offset = out_f.tell()
        article_index.set_index(title, (article_number, offset, restricted, is_redirect))
        # 1-, 2- and 3-character prefix keys, '\0'-padded to 3 bytes;
        # only the first occurrence of each prefix is recorded
        key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
        key2 = key3[0:2] + '\0'
        key1 = key3[0:1] + '\0\0'
        if key1 not in index_matrix:
            index_matrix[key1] = offset
        if key2 not in index_matrix:
            index_matrix[key2] = offset
        if key3 not in index_matrix:
            index_matrix[key3] = offset
        # every 16th record is a full (uncompressed) restart point
        if 0 == mod_counter & 0x0f:
            bigram_common_length = 0
            utf8_common_length = 0
        else:
            bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
            utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
        mod_counter += 1
        previous_bigram_title = bigram_title
        previous_utf8_title = utf8_title
        # replace a shared prefix (2+ bytes) with a one byte length code
        if bigram_common_length > 1:
            bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
        if utf8_common_length > 1:
            utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]
        out_f.write(struct.pack('<I', article_number) + '\0' + bigram_title + '\0' + utf8_title + '\0')

    PrintLog.message(u'Final segment: {0:s}'.format(out_f.current_filename))
    out_f.close()
    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))