Esempio n. 1
0
def output_fnd(filename, article_index, language_processor, truncate_title):
    """Write the bigram table and the prefix-compressed title list to a file.

    File layout produced by this function:

    1. A header of 128 bigrams: the entries of the global ``bigram`` table
       with the highest values (presumably occurrence counts -- confirm
       against the code that fills the table), written in descending order
       and padded with ``'zz'`` when fewer than 128 entries exist.
    2. One record per title, in search-key order::

           struct.pack('<I', article_number) + '\\0'
           + bigram-encoded search key + '\\0'
           + UTF-8 title + '\\0'

       Except for every 16th record, a prefix shared with the previous
       record (when longer than one character) is replaced by the single
       character ``chr(shared_length - 1)``.

    Side effects:

    * The global ``bigram`` is rebound to a dict mapping each written
      bigram to its one-character code ``chr(128)`` .. ``chr(255)``.
    * The global ``index_matrix`` is rebound to a dict mapping
      ``'\\0\\0\\0'`` to the offset of the first record, and each lower-cased
      1-, 2- and 3-character key prefix (NUL padded to three characters) to
      the offset of the first record that has that prefix.
    * ``article_index.set_index`` is called for every written title so that
      its stored offset is the position of its record in the output file.

    Args:
        filename: path of the output file; opened in ``'wb'`` mode.
        article_index: object providing ``all_indices()``,
            ``get_index(title)`` (returning a 4-tuple of article number,
            offset, restricted flag, redirect flag) and
            ``set_index(title, tuple)``.
        language_processor: object whose ``translate(title)`` result is fed
            to ``SearchKey.make_key`` to build the sort/search key.
        truncate_title: when true the UTF-8 title is cut to
            ``MAXIMUM_TITLE_LENGTH`` bytes, otherwise to
            ``MAXIMUM_TITLE_ACTUAL`` bytes.
    """
    global bigram
    global index_matrix

    table_size = 128   # number of bigram slots in the file header
    first_code = 128   # bigram codes occupy chr(128) .. chr(255)

    PrintLog.message(u'Writing bigrams: {0:s}'.format(filename))
    start_time = time.time()

    # The context manager guarantees the file is closed even when encoding,
    # an index lookup or a write raises part-way through.
    with open(filename, 'wb') as out_f:

        # Highest-valued bigrams first; only the top `table_size` get a code.
        ranked = sorted(((count, pair) for pair, count in bigram.iteritems()),
                        reverse=True)[:table_size]

        bigram = {}
        for slot, (_count, pair) in enumerate(ranked):
            out_f.write(pair)
            bigram[pair] = chr(first_code + slot)

        # Pad the header so it always holds exactly `table_size` entries.
        for slot in range(len(ranked), table_size):
            out_f.write('zz')
            bigram['zz'] = chr(first_code + slot)

        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        # create pfx matrix and write encoded titles

        PrintLog.message(u'Sorting titles')
        start_time = time.time()

        article_list = sorted(
            (SearchKey.make_key(language_processor.translate(title)), title)
            for title in article_index.all_indices())

        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Writing matrix: {0:s}'.format(filename))
        start_time = time.time()

        # The all-NUL key marks where the title records begin.
        index_matrix = {'\0\0\0': out_f.tell()}

        if truncate_title:
            utf8_limit = MAXIMUM_TITLE_LENGTH
        else:
            utf8_limit = MAXIMUM_TITLE_ACTUAL

        previous_bigram_title = ''
        previous_utf8_title = ''
        record_count = 0   # counts written records only (skipped ones excluded)

        for stripped_title, title in article_list:

            bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
            (article_number, _old_offset, restricted, is_redirect) = \
                article_index.get_index(title)

            # A redirect whose search key encodes to nothing is not written.
            if '' == bigram_title and is_redirect:
                continue

            utf8_title = title.encode('utf-8')[:utf8_limit]

            offset = out_f.tell()
            article_index.set_index(
                title, (article_number, offset, restricted, is_redirect))

            # Remember the first record seen for each 1/2/3 character prefix.
            key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
            for key in (key3[0:1] + '\0\0', key3[0:2] + '\0', key3):
                index_matrix.setdefault(key, offset)

            # Every 16th record is stored in full; the others may share a
            # prefix with the record written just before them.
            if 0 == record_count & 0x0f:
                bigram_common_length = 0
                utf8_common_length = 0
            else:
                bigram_common_length = common_prefix_length(
                    previous_bigram_title, bigram_title)
                utf8_common_length = common_prefix_length(
                    previous_utf8_title, utf8_title)
            record_count += 1

            # The *uncompressed* forms are what the next record compares to.
            previous_bigram_title = bigram_title
            previous_utf8_title = utf8_title

            if bigram_common_length > 1:
                bigram_title = (chr(bigram_common_length - 1)
                                + bigram_title[bigram_common_length:])
            if utf8_common_length > 1:
                utf8_title = (chr(utf8_common_length - 1)
                              + utf8_title[utf8_common_length:])

            out_f.write(struct.pack('<I', article_number) + '\0'
                        + bigram_title + '\0' + utf8_title + '\0')

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
Esempio n. 2
0
def output_fnd(filename_format, article_index, language_processor, truncate_title):
    """Write the bigram table and the prefix-compressed title list.

    Output goes through a ``SegmentedFileWriter`` built from
    *filename_format* and the module-level ``FND_FILE_SEGMENT_SIZE``.
    All offsets recorded below are whatever ``out_f.tell()`` reports
    (NOTE(review): presumably a position across all segments -- confirm
    against ``SegmentedFileWriter``).

    Data written:

    1. A header of 128 bigrams: the entries of the global ``bigram`` table
       with the highest values (presumably occurrence counts -- confirm
       against the code that fills the table), written in descending order
       and padded with ``'zz'`` when fewer than 128 entries exist.
    2. One record per distinct (search key, title) pair, in sorted order::

           struct.pack('<I', article_number) + '\\0'
           + bigram-encoded search key + '\\0'
           + UTF-8 title + '\\0'

       Except for every 16th record, a prefix shared with the previous
       record (when longer than one character) is replaced by the single
       character ``chr(shared_length - 1)``.

    Side effects:

    * The global ``bigram`` is rebound to a dict mapping each written
      bigram to its one-character code ``chr(128)`` .. ``chr(255)``.
    * The global ``index_matrix`` is rebound to a dict mapping
      ``'\\0\\0\\0'`` to the offset of the first record, and each lower-cased
      1-, 2- and 3-character key prefix (NUL padded to three characters) to
      the offset of the first record that has that prefix.
    * ``article_index.set_index`` is called for every written record with
      the record's offset; a title that yields several search keys is
      updated once per record, so the last one written wins.

    Args:
        filename_format: passed unchanged to ``SegmentedFileWriter``.
        article_index: object providing ``all_indices()``,
            ``get_index(title)`` (returning a 4-tuple of article number,
            offset, restricted flag, redirect flag) and
            ``set_index(title, tuple)``.
        language_processor: object whose ``translate(title)`` returns an
            iterable of translated titles; each one, cut to
            ``MAXIMUM_TITLE_LENGTH``, is turned into a search key with
            ``SearchKey.make_key``.
        truncate_title: when true the UTF-8 title is cut to
            ``MAXIMUM_TITLE_LENGTH`` bytes, otherwise to
            ``MAXIMUM_TITLE_ACTUAL`` bytes.
    """
    global bigram
    global index_matrix

    table_size = 128   # number of bigram slots in the header
    first_code = 128   # bigram codes occupy chr(128) .. chr(255)

    start_time = time.time()
    out_f = SegmentedFileWriter(filename_format, FND_FILE_SEGMENT_SIZE)

    # try/finally guarantees the writer is closed even when encoding, an
    # index lookup or a write raises part-way through.
    try:
        PrintLog.message(u'Writing bigrams: {0:s}'.format(out_f.current_filename))

        # Highest-valued bigrams first; only the top `table_size` get a code.
        ranked = sorted(((count, pair) for pair, count in bigram.iteritems()),
                        reverse=True)[:table_size]

        bigram = {}
        for slot, (_count, pair) in enumerate(ranked):
            out_f.write(pair)
            bigram[pair] = chr(first_code + slot)

        # Pad the header so it always holds exactly `table_size` entries.
        for slot in range(len(ranked), table_size):
            out_f.write('zz')
            bigram['zz'] = chr(first_code + slot)

        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        # create pfx matrix and write encoded titles

        PrintLog.message(u'Sorting titles')
        start_time = time.time()

        # Different translations of one title can produce the same search
        # key; the set keeps a single (key, title) pair for each.
        unique_articles = set(
            (SearchKey.make_key(translated_title[:MAXIMUM_TITLE_LENGTH]), title)
            for title in article_index.all_indices()
            for translated_title in language_processor.translate(title))

        article_list = sorted(unique_articles)

        PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))

        PrintLog.message(u'Writing matrix: {0:s}'.format(out_f.current_filename))
        start_time = time.time()

        # The all-NUL key marks where the title records begin.
        index_matrix = {'\0\0\0': out_f.tell()}

        if truncate_title:
            utf8_limit = MAXIMUM_TITLE_LENGTH
        else:
            utf8_limit = MAXIMUM_TITLE_ACTUAL

        previous_bigram_title = ''
        previous_utf8_title = ''
        record_count = 0   # counts written records only (skipped ones excluded)

        for stripped_title, title in article_list:

            bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
            (article_number, _old_offset, restricted, is_redirect) = \
                article_index.get_index(title)

            # A redirect whose search key encodes to nothing is not written.
            if '' == bigram_title and is_redirect:
                continue

            utf8_title = title.encode('utf-8')[:utf8_limit]

            offset = out_f.tell()
            article_index.set_index(
                title, (article_number, offset, restricted, is_redirect))

            # Remember the first record seen for each 1/2/3 character prefix.
            key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
            for key in (key3[0:1] + '\0\0', key3[0:2] + '\0', key3):
                index_matrix.setdefault(key, offset)

            # Every 16th record is stored in full; the others may share a
            # prefix with the record written just before them.
            if 0 == record_count & 0x0f:
                bigram_common_length = 0
                utf8_common_length = 0
            else:
                bigram_common_length = common_prefix_length(
                    previous_bigram_title, bigram_title)
                utf8_common_length = common_prefix_length(
                    previous_utf8_title, utf8_title)
            record_count += 1

            # The *uncompressed* forms are what the next record compares to.
            previous_bigram_title = bigram_title
            previous_utf8_title = utf8_title

            if bigram_common_length > 1:
                bigram_title = (chr(bigram_common_length - 1)
                                + bigram_title[bigram_common_length:])
            if utf8_common_length > 1:
                utf8_title = (chr(utf8_common_length - 1)
                              + utf8_title[utf8_common_length:])

            out_f.write(struct.pack('<I', article_number) + '\0'
                        + bigram_title + '\0' + utf8_title + '\0')

        PrintLog.message(u'Final segment: {0:s}'.format(out_f.current_filename))
    finally:
        out_f.close()

    PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))