import csv

import psutil

# project-local modules (import paths assumed from the class names used below)
from tokenizer import Tokenizer
from indexer import Indexer
from ranker import Ranker


class RTLI:  # Reader, tokenizer, linguistic, indexer
    def __init__(self, tokenizer_mode, file='../content/metadata.csv',
                 stopwords_file="../content/snowball_stopwords_EN.txt",
                 chunksize=10000, queries_path='../content/queries.txt',
                 rank_mode='bm25', docs_limit=50, positional_flag=False):
        self.tokenizer = Tokenizer(tokenizer_mode, stopwords_file)
        self.indexer = Indexer(positional_flag=positional_flag)
        self.ranker = Ranker(queries_path=queries_path, mode=rank_mode, docs_limit=docs_limit)
        self.file = file

        # keep the flag so process() can pass it to the indexer
        self.positional_flag = positional_flag

        # number of CSV lines to be read at once
        self.chunksize = chunksize
        self.block_number = 0

        # used in BM25 to check each document's length and the average over all docs
        self.docs_length = {}

        # collection size
        self.collection_size = 0

    # auxiliary generator that yields the reader's rows in chunks of `chunksize`
    def gen_chunks(self, reader):
        chunk = []
        for i, line in enumerate(reader):
            if i % self.chunksize == 0 and i > 0:
                yield chunk
                chunk = []  # start a fresh list instead of mutating the one just yielded
            chunk.append(line)
        yield chunk

    # main function of indexing and tokenizing
    def process(self, reset_dirs):
        # optional arg to clear our directories
        if reset_dirs:
            self.indexer.reset_dirs()  # clean dirs

        reindex_flag = self.indexer.create_dirs()

        if not reindex_flag:
            # Reading step: the reader is passed down here so we can read chunk by chunk
            with open(self.file, newline='', encoding="utf-8") as csvfile:
                reader = csv.DictReader(csvfile)
                for chunk in self.gen_chunks(reader):
                    tokens = []
                    # snapshot of available memory (recorded but not otherwise used here)
                    mem = psutil.virtual_memory().available

                    for row in chunk:
                        index = row['cord_uid']

                        # Tokenizer step
                        if row['abstract'] != "":
                            appended_string = row['abstract'] + " " + row['title']
                            doc_tokens = self.tokenizer.tokenize(appended_string, index)
                            tokens += doc_tokens

                            # store this document's own length (not the chunk-accumulated one)
                            self.docs_length[index] = len(doc_tokens)
                            self.collection_size += 1

                            # SPIMI approach
                            block_index = self.indexer.index(tokens, index, self.positional_flag)

                    self.indexer.create_block(self.block_number)
                    self.block_number += 1
                    self.indexer.updateColSize(self.collection_size)

                    tokens = []  # clear out memory from the last batch of tokens

            self.indexer.merge_blocks()  # merge block files without loading the whole array

        # update the info document, useful when the collection is already indexed
        # but these parameters are still needed
        self.indexer.write_info(self.collection_size)
        self.indexer.write_docs_len(self.docs_length)

        # Here we start evaluating by reading the several index files
        # self.indexed_map = self.indexer.getIndexed()

    def rank(self, analyze_table, tokenizer_mode, positional_flag):
        self.ranker.update(self.docs_length, self.collection_size, tokenizer_mode,
                           "../content/snowball_stopwords_EN.txt")
        self.ranker.process_queries(analyze_table=analyze_table, positional_flag=positional_flag)

    def write_index_file(self):
        self.indexer.write_index_file()
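
# Minimal usage sketch (assumed entry point): the argument values, file paths,
# and call order below are illustrative assumptions, not part of the class itself.
if __name__ == "__main__":
    rtli = RTLI(tokenizer_mode="complex",          # tokenizer mode value is assumed
                file="../content/metadata.csv",
                rank_mode="bm25",
                positional_flag=False)

    # build the SPIMI blocks and merge them (pass True to wipe existing index dirs first)
    rtli.process(reset_dirs=False)

    # persist the merged index, then rank the queries read from queries.txt
    rtli.write_index_file()
    rtli.rank(analyze_table=True, tokenizer_mode="complex", positional_flag=False)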