def spimi(documents,
          res_dir='D:/res/',
          blocks_dir='D:/blocks/',
          encoding='latin-1',
          encode=to_gamma_str,
          block_len=4,
          memory_limit=60):
    """Index ``documents`` SPIMI-style: spill partial blocks, then merge.

    Words are accumulated in an in-memory dict. Whenever system memory
    usage exceeds ``memory_limit`` percent, the dict is sorted, handed to
    ``write_block`` and cleared. Once every word has been consumed, the
    number of written blocks is passed to ``unite_blocks``.

    Args:
        documents: Passed unchanged to ``get_words``.
        res_dir: Directory for the final result; created if missing.
        blocks_dir: Directory for intermediate blocks; created if missing.
        encoding: Forwarded to ``get_words``, ``write_block`` and
            ``unite_blocks``.
        encode: Forwarded to ``unite_blocks``.
        block_len: Forwarded to ``unite_blocks``.
        memory_limit: System-wide memory usage, in percent, above which
            the current block is flushed to disk.
    """
    Path(res_dir).mkdir(parents=True, exist_ok=True)
    Path(blocks_dir).mkdir(parents=True, exist_ok=True)

    counter = 0  # number of blocks written so far; also the next block id
    m_dict = {}  # word -> [occurrence count, [docIDs in arrival order]]
    gc.collect()

    for word, doc_id in get_words(documents, encoding):
        # Flush only when there is something to write: if memory is already
        # above the limit on the very first word, an unguarded flush would
        # emit an empty block (the final flush below is guarded likewise).
        if m_dict and psutil.virtual_memory().percent > memory_limit:
            write_block(counter, SortedDict(m_dict), blocks_dir, encoding)
            counter += 1
            m_dict.clear()
            gc.collect()

        entry = m_dict.get(word)
        if entry is None:
            m_dict[word] = [1, [doc_id]]
        else:
            entry[0] += 1
            # Only the most recent docID is compared, so this deduplicates
            # consecutive repeats (assumes get_words yields each document's
            # words contiguously -- TODO confirm).
            if entry[1][-1] != doc_id:
                entry[1].append(doc_id)

    # Write whatever is left after the last in-loop flush.
    if m_dict:
        write_block(counter, SortedDict(m_dict), blocks_dir, encoding)
        counter += 1

    print('unite blocks')
    unite_blocks(counter, blocks_dir, res_dir, encoding, encode, block_len)
    print('after union completed')
def spimi(documents, res_dir='res', memory_limit=40):
    """Index ``documents`` SPIMI-style: spill partial blocks, then merge.

    Words are accumulated in an in-memory dict. Whenever system memory
    usage exceeds ``memory_limit`` percent, the dict is sorted, handed to
    ``write_block`` and cleared. Once every word has been consumed, the
    number of written blocks is passed to ``unite_blocks`` together with
    ``res_dir``.

    Args:
        documents: Passed unchanged to ``get_words``.
        res_dir: Directory for the final result; created if missing.
        memory_limit: System-wide memory usage, in percent, above which
            the current block is flushed to disk.
    """
    Path(res_dir).mkdir(parents=True, exist_ok=True)
    Path('blocks').mkdir(parents=True, exist_ok=True)

    counter = 0  # number of blocks written so far; also the next block id
    m_dict = {}  # word -> [occurrence count, [docIDs in arrival order]]
    print('begin')
    try:
        for word, doc_id in get_words(documents):
            # Flush only when there is something to write: if memory is
            # already above the limit on the very first word, an unguarded
            # flush would emit an empty block (the final flush below is
            # guarded likewise).
            if m_dict and psutil.virtual_memory().percent > memory_limit:
                write_block(counter, SortedDict(m_dict))
                counter += 1
                m_dict.clear()
                print(psutil.virtual_memory().percent)

            entry = m_dict.get(word)
            if entry is None:
                m_dict[word] = [1, [doc_id]]
            else:
                entry[0] += 1
                # Only the most recent docID is compared, so this
                # deduplicates consecutive repeats (assumes get_words
                # yields each document's words contiguously -- TODO
                # confirm).
                if entry[1][-1] != doc_id:
                    entry[1].append(doc_id)

        # Write whatever is left after the last in-loop flush.
        if m_dict:
            write_block(counter, SortedDict(m_dict))
            counter += 1

        print('unite blocks')
        unite_blocks(counter, res_dir)
        print('after union completed')
    except MemoryError:
        # NOTE(review): the error is reported but not re-raised, so the
        # caller cannot tell that indexing stopped early.
        print(psutil.virtual_memory())
Esempio n. 3
0
def get_reversed_index(documents) -> WordCollectionInfo:
    """Collect vocabulary and postings for every word in ``documents``.

    Each ``(word, doc_id)`` pair yielded by ``get_words`` is passed to
    ``add_word`` along with the running word id, the vocabulary dict and
    the postings list; ``add_word`` returns the word id to use next
    (presumably filling the two containers in place -- confirm against
    ``add_word``).

    Returns:
        A ``WordCollectionInfo`` built from the key-sorted vocabulary, the
        postings list, the original ``documents`` and the total number of
        pairs processed.
    """
    term_table = {}
    posting_lists = []
    next_word_id = 0
    total_words = 0  # stays 0 when get_words yields nothing
    pairs = get_words(documents)
    for total_words, (word, doc_id) in enumerate(pairs, start=1):
        next_word_id = add_word(word, next_word_id, doc_id, term_table,
                                posting_lists)
    sorted_terms = SortedDict(term_table)
    return WordCollectionInfo(sorted_terms, posting_lists, documents,
                              total_words)