import csv

import psutil

# project-local modules (import paths assumed from the class names used below)
from tokenizer import Tokenizer
from indexer import Indexer
from ranker import Ranker


class RTLI:  # Reader, tokenizer, linguistic, indexer
    def __init__(self, tokenizer_mode, file='../content/metadata.csv',
                 stopwords_file="../content/snowball_stopwords_EN.txt",
                 chunksize=10000, queries_path='../content/queries.txt',
                 rank_mode='bm25', docs_limit=50, positional_flag=False):
        self.tokenizer = Tokenizer(tokenizer_mode, stopwords_file)
        self.indexer = Indexer(positional_flag=positional_flag)
        self.ranker = Ranker(queries_path=queries_path, mode=rank_mode, docs_limit=docs_limit)
        self.file = file

        # keep the flag so process() can pass it to the indexer
        self.positional_flag = positional_flag

        # number of CSV lines to be read at once
        self.chunksize = chunksize
        self.block_number = 0

        # used in BM25 to check each document's length and the average over all docs
        self.docs_length = {}

        # collection size
        self.collection_size = 0

    # auxiliary generator that yields the reader's rows in chunks of `chunksize`
    def gen_chunks(self, reader):
        chunk = []
        for i, line in enumerate(reader):
            if i % self.chunksize == 0 and i > 0:
                yield chunk
                chunk = []  # start a fresh list instead of mutating the one just yielded
            chunk.append(line)
        yield chunk

    # main function of indexing and tokenizing
    def process(self, reset_dirs):
        # optional arg to clear our directories
        if reset_dirs:
            self.indexer.reset_dirs()  # clean dirs

        reindex_flag = self.indexer.create_dirs()

        if not reindex_flag:
            # Reading step: the reader is passed down here so we can read chunk by chunk
            with open(self.file, newline='', encoding="utf-8") as csvfile:
                reader = csv.DictReader(csvfile)
                for chunk in self.gen_chunks(reader):
                    tokens = []
                    # snapshot of available memory (recorded but not otherwise used here)
                    mem = psutil.virtual_memory().available

                    for row in chunk:
                        index = row['cord_uid']

                        # Tokenizer step
                        if row['abstract'] != "":
                            appended_string = row['abstract'] + " " + row['title']
                            doc_tokens = self.tokenizer.tokenize(appended_string, index)
                            tokens += doc_tokens

                            # store this document's own length (not the chunk-accumulated one)
                            self.docs_length[index] = len(doc_tokens)
                            self.collection_size += 1

                            # SPIMI approach
                            block_index = self.indexer.index(tokens, index, self.positional_flag)

                    self.indexer.create_block(self.block_number)
                    self.block_number += 1
                    self.indexer.updateColSize(self.collection_size)

                    tokens = []  # clear out memory from the last batch of tokens

            self.indexer.merge_blocks()  # merge block files without loading the whole array

        # update the info document, useful when the collection is already indexed
        # but these parameters are still needed
        self.indexer.write_info(self.collection_size)
        self.indexer.write_docs_len(self.docs_length)

        # Here we start evaluating by reading the several index files
        # self.indexed_map = self.indexer.getIndexed()

    def rank(self, analyze_table, tokenizer_mode, positional_flag):
        self.ranker.update(self.docs_length, self.collection_size, tokenizer_mode,
                           "../content/snowball_stopwords_EN.txt")
        self.ranker.process_queries(analyze_table=analyze_table, positional_flag=positional_flag)

    def write_index_file(self):
        self.indexer.write_index_file()
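
# Minimal usage sketch (assumed entry point): the argument values, file paths,
# and call order below are illustrative assumptions, not part of the class itself.
if __name__ == "__main__":
    rtli = RTLI(tokenizer_mode="complex",          # tokenizer mode value is assumed
                file="../content/metadata.csv",
                rank_mode="bm25",
                positional_flag=False)

    # build the SPIMI blocks and merge them (pass True to wipe existing index dirs first)
    rtli.process(reset_dirs=False)

    # persist the merged index, then rank the queries read from queries.txt
    rtli.write_index_file()
    rtli.rank(analyze_table=True, tokenizer_mode="complex", positional_flag=False)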