def output_fnd(filename, article_index, language_processor, truncate_title):
    """Write the bigram table followed by the encoded title records.

    The output file starts with a 128-entry bigram table (most frequent
    bigrams first, padded with 'zz' if fewer than 128 exist), followed by
    one record per title in sorted search-key order.  Each record is:

        <article number, little-endian uint32> NUL
        <bigram-encoded search key> NUL
        <UTF-8 title> NUL

    Both strings are prefix-compressed against the previous record: when
    more than one leading character is shared, the shared prefix is replaced
    by the single character chr(shared_length - 1).  Every 16th record is
    written uncompressed.

    Side effects:
      * the global ``bigram`` is rebuilt to map each chosen bigram to its
        one-character code (chr(128) .. chr(255));
      * the global ``index_matrix`` is rebuilt to map 1-, 2- and 3-character
        lower-cased key prefixes (NUL padded to 3 characters) to the file
        offset of the first record having that prefix;
      * every written title has its offset in ``article_index`` replaced by
        the file offset of its record.

    Arguments:
      filename           -- path of the file to create
      article_index      -- object providing all_indices(), get_index(title)
                            and set_index(title, tuple)
      language_processor -- object whose translate(title) result is passed
                            to SearchKey.make_key
      truncate_title     -- if true the UTF-8 title is cut to
                            MAXIMUM_TITLE_LENGTH bytes, otherwise to
                            MAXIMUM_TITLE_ACTUAL bytes
    """
    # only the globals that are rebound need declaring; the MAXIMUM_*
    # constants are read-only here
    global bigram
    global index_matrix

    PrintLog.message(u'Writing bigrams: {0:s}'.format(filename))
    start_time = time.time()

    # 'with' ensures the file is closed even if encoding or an index
    # lookup raises part way through
    with open(filename, 'wb') as out_f:

        # highest count first; only the top 128 get a one-byte code
        sortedgram = sorted(((count, gram) for gram, count in bigram.iteritems()),
                            reverse=True)
        top_bigrams = sortedgram[:128]

        bigram = {}
        for i, (_count, gram) in enumerate(top_bigrams):
            out_f.write(gram)
            bigram[gram] = chr(i + 128)

        # pad so that the table always holds exactly 128 entries
        for i in range(len(top_bigrams), 128):
            out_f.write('zz')
            bigram['zz'] = chr(i + 128)

        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        # create pfx matrix and write encoded titles

        PrintLog.message(u'Sorting titles')
        start_time = time.time()

        article_list = sorted(
            (SearchKey.make_key(language_processor.translate(title)), title)
            for title in article_index.all_indices())

        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Writing matrix: {0:s}'.format(filename))
        start_time = time.time()

        index_matrix = {}
        index_matrix['\0\0\0'] = out_f.tell()

        previous_bigram_title = ''
        previous_utf8_title = ''
        mod_counter = 0

        for stripped_title, title in article_list:

            bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
            (article_number, _old_offset, restricted, is_redirect) = \
                article_index.get_index(title)

            # a redirect whose search key is empty is not written at all
            if '' == bigram_title and is_redirect:
                continue

            # NOTE(review): slicing the encoded bytes may cut a multi-byte
            # UTF-8 sequence in half -- confirm the reader tolerates this
            utf8_title = title.encode('utf-8')
            if truncate_title:
                utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
            else:
                utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]

            offset = out_f.tell()
            article_index.set_index(title, (article_number, offset, restricted, is_redirect))

            # record the first offset seen for each 1/2/3 character prefix
            key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
            key2 = key3[0:2] + '\0'
            key1 = key3[0:1] + '\0\0'
            for key in (key1, key2, key3):
                index_matrix.setdefault(key, offset)

            # every 16th written record is stored in full
            if (mod_counter & 0x0f) == 0:
                bigram_common_length = 0
                utf8_common_length = 0
            else:
                bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
                utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
            mod_counter += 1

            # remember the uncompressed forms for the next comparison
            previous_bigram_title = bigram_title
            previous_utf8_title = utf8_title

            if bigram_common_length > 1:
                bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
            if utf8_common_length > 1:
                utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]

            out_f.write(struct.pack('<I', article_number) + '\0'
                        + bigram_title + '\0' + utf8_title + '\0')

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
def output_fnd(filename_format, article_index, language_processor, truncate_title):
    """Write the bigram table followed by the encoded title records.

    Output goes through a SegmentedFileWriter created from
    ``filename_format`` and FND_FILE_SEGMENT_SIZE.  It starts with a
    128-entry bigram table (most frequent bigrams first, padded with 'zz'
    if fewer than 128 exist), followed by one record per unique
    (search key, title) pair in sorted order.  Each record is:

        <article number, little-endian uint32> NUL
        <bigram-encoded search key> NUL
        <UTF-8 title> NUL

    Both strings are prefix-compressed against the previous record: when
    more than one leading character is shared, the shared prefix is replaced
    by the single character chr(shared_length - 1).  Every 16th record is
    written uncompressed.

    Side effects:
      * the global ``bigram`` is rebuilt to map each chosen bigram to its
        one-character code (chr(128) .. chr(255));
      * the global ``index_matrix`` is rebuilt to map 1-, 2- and 3-character
        lower-cased key prefixes (NUL padded to 3 characters) to the offset
        (as reported by the writer's tell()) of the first record having
        that prefix;
      * every written title has its offset in ``article_index`` replaced by
        the offset of its record; a title that yields several search keys
        ends up with the offset of the last record written for it.

    Arguments:
      filename_format    -- passed to SegmentedFileWriter to name the
                            output segment(s)
      article_index      -- object providing all_indices(), get_index(title)
                            and set_index(title, tuple)
      language_processor -- object whose translate(title) returns an
                            iterable of translated titles; each one
                            produces its own search key
      truncate_title     -- if true the UTF-8 title is cut to
                            MAXIMUM_TITLE_LENGTH bytes, otherwise to
                            MAXIMUM_TITLE_ACTUAL bytes
    """
    # only the globals that are rebound need declaring; the MAXIMUM_* and
    # FND_FILE_SEGMENT_SIZE constants are read-only here
    global bigram
    global index_matrix

    start_time = time.time()
    out_f = SegmentedFileWriter(filename_format, FND_FILE_SEGMENT_SIZE)

    # try/finally ensures the writer is closed even if encoding or an
    # index lookup raises part way through
    try:
        PrintLog.message(u'Writing bigrams: {0:s}'.format(out_f.current_filename))

        # highest count first; only the top 128 get a one-byte code
        sortedgram = sorted(((count, gram) for gram, count in bigram.iteritems()),
                            reverse=True)
        top_bigrams = sortedgram[:128]

        bigram = {}
        for i, (_count, gram) in enumerate(top_bigrams):
            out_f.write(gram)
            bigram[gram] = chr(i + 128)

        # pad so that the table always holds exactly 128 entries
        for i in range(len(top_bigrams), 128):
            out_f.write('zz')
            bigram['zz'] = chr(i + 128)

        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        # create pfx matrix and write encoded titles

        PrintLog.message(u'Sorting titles')
        start_time = time.time()

        # different translations of one title can truncate to the same
        # search key, so collapse duplicate (key, title) pairs before sorting
        article_list = sorted(set(
            (SearchKey.make_key(translated_title[:MAXIMUM_TITLE_LENGTH]), title)
            for title in article_index.all_indices()
            for translated_title in language_processor.translate(title)))

        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Writing matrix: {0:s}'.format(out_f.current_filename))
        start_time = time.time()

        index_matrix = {}
        index_matrix['\0\0\0'] = out_f.tell()

        previous_bigram_title = ''
        previous_utf8_title = ''
        mod_counter = 0

        for stripped_title, title in article_list:

            bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
            (article_number, _old_offset, restricted, is_redirect) = \
                article_index.get_index(title)

            # a redirect whose search key is empty is not written at all
            if '' == bigram_title and is_redirect:
                continue

            # NOTE(review): slicing the encoded bytes may cut a multi-byte
            # UTF-8 sequence in half -- confirm the reader tolerates this
            utf8_title = title.encode('utf-8')
            if truncate_title:
                utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
            else:
                utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]

            offset = out_f.tell()
            article_index.set_index(title, (article_number, offset, restricted, is_redirect))

            # record the first offset seen for each 1/2/3 character prefix
            key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
            key2 = key3[0:2] + '\0'
            key1 = key3[0:1] + '\0\0'
            for key in (key1, key2, key3):
                index_matrix.setdefault(key, offset)

            # every 16th written record is stored in full
            if (mod_counter & 0x0f) == 0:
                bigram_common_length = 0
                utf8_common_length = 0
            else:
                bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
                utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
            mod_counter += 1

            # remember the uncompressed forms for the next comparison
            previous_bigram_title = bigram_title
            previous_utf8_title = utf8_title

            if bigram_common_length > 1:
                bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
            if utf8_common_length > 1:
                utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]

            out_f.write(struct.pack('<I', article_number) + '\0'
                        + bigram_title + '\0' + utf8_title + '\0')

        PrintLog.message(u'Final segment: {0:s}'.format(out_f.current_filename))
    finally:
        out_f.close()

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))