Example #1
import csv
import psutil

# Tokenizer, Indexer and Ranker come from the project's own modules (their imports are not shown here)
class RTLI:  # Reader, tokenizer, linguistic, indexer
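    # Pipeline: read metadata.csv in chunks, tokenize each document's title + abstract,
    # build SPIMI blocks on disk and merge them, then rank the queries (e.g. with BM25).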
    def __init__(self,
                 tokenizer_mode,
                 file='../content/metadata.csv',
                 stopwords_file="../content/snowball_stopwords_EN.txt",
                 chunksize=10000,
                 queries_path='../content/queries.txt',
                 rank_mode='bm25',
                 docs_limit=50,
                 positional_flag=False):
        self.tokenizer = Tokenizer(tokenizer_mode, stopwords_file)
        self.indexer = Indexer(positional_flag=positional_flag)
        self.positional_flag = positional_flag
        self.ranker = Ranker(queries_path=queries_path,
                             mode=rank_mode,
                             docs_limit=docs_limit)
        self.file = file

        # defines the number of lines to be read at once
        self.chunksize = chunksize
        self.block_number = 0

        # per-document lengths, used by BM25 together with the average document length
        self.docs_length = {}

        # collection size
        self.collection_size = 0

    # auxiliary function to generate chunks of text to read
    def gen_chunks(self, reader):
        chunk = []
        for i, line in enumerate(reader):
            if i % self.chunksize == 0 and i > 0:
                yield chunk
                chunk = []  # rebind instead of clearing, so callers can safely keep the previous chunk
            chunk.append(line)
        yield chunk

    # main function of indexing and tokenizing
    def process(self, reset_dirs):

        # optional arg to clear our directories
        if reset_dirs:
            self.indexer.reset_dirs()

        # create/prepare the index directories; the returned flag signals that indexing can be skipped
        reindex_flag = self.indexer.create_dirs()

        if not reindex_flag:
            # Reading step
            # the csv reader is passed here so the file can be consumed chunk by chunk
            with open(self.file, newline='', encoding="utf-8") as csvfile:
                reader = csv.DictReader(csvfile)
                for chunk in self.gen_chunks(reader):
                    tokens = []
                    # available-memory snapshot (not used further in this loop)
                    mem = psutil.virtual_memory().available
                    for row in chunk:
                        index = row['cord_uid']
                        # Tokenizer step: only rows with a non-empty abstract are indexed
                        if row['abstract'] != "":
                            appended_string = row['abstract'] + " " + row['title']
                            doc_tokens = self.tokenizer.tokenize(appended_string, index)
                            tokens += doc_tokens

                            # length of this document alone, not the running token total
                            self.docs_length[index] = len(doc_tokens)
                            self.collection_size += 1

                    # SPIMI approach: index this chunk's tokens and write them out as a block
                    # (index still holds the id of the last row in this chunk)
                    self.indexer.index(tokens, index, self.positional_flag)
                    self.indexer.create_block(self.block_number)

                    self.block_number += 1

            self.indexer.updateColSize(self.collection_size)
            tokens = []  # clear out memory from last batch of tokens
            self.indexer.merge_blocks()
            # we shouldn't load the whole array into memory during the merge

            # update the info document; useful when the collection is already indexed but these parameters are still required
            self.indexer.write_info(self.collection_size)
            self.indexer.write_docs_len(self.docs_length)

        # Evaluation starts from here, by reading the index back from the files on disk
        #self.indexed_map = self.indexer.getIndexed()

    def rank(self, analyze_table, tokenizer_mode, positional_flag):
        self.ranker.update(self.docs_length, self.collection_size,
                           tokenizer_mode,
                           "../content/snowball_stopwords_EN.txt")
        self.ranker.process_queries(analyze_table=analyze_table,
                                    positional_flag=positional_flag)

    def write_index_file(self):
        self.indexer.write_index_file()
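
A minimal usage sketch, assuming the default file paths above exist; the __main__ guard, the tokenizer mode value 'improved' and the other argument values are illustrative assumptions rather than part of the original class:

if __name__ == '__main__':
    # build the engine with a positional index and BM25 ranking (argument values are assumed)
    rtli = RTLI(tokenizer_mode='improved',
                rank_mode='bm25',
                docs_limit=50,
                positional_flag=True)

    # tokenize and index the collection, clearing any previous index directories first
    rtli.process(reset_dirs=True)
    rtli.write_index_file()

    # score the queries in queries.txt against the freshly built index
    rtli.rank(analyze_table=True,
              tokenizer_mode='improved',
              positional_flag=True)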