def __init__(self, ix: index.FileIndex, ix_reader: IndexReader):
    """Store the index and its reader, and build a count-ranked document list.

    Every item yielded by ``ix_reader.iter_docs()`` is turned into a
    ``(count, key)`` tuple, where ``count`` is ``int(item[1]['count'])`` and
    ``key`` is ``item[0]``. The tuples are kept in ``self._pop_dn`` sorted in
    descending order, so the entry with the largest count comes first.

    :param ix: the index object; stored unchanged as ``self._ix``.
    :param ix_reader: a reader exposing ``iter_docs()``; stored unchanged as
        ``self._reader``.
    """
    self._ix = ix
    self._reader = ix_reader

    # NOTE(review): item[0] is presumably the document number and item[1]
    # the stored-fields mapping -- confirm against the reader in use.
    ranked = [
        (int(item[1]['count']), item[0])
        for item in ix_reader.iter_docs()
    ]
    # Tuples compare element-wise, so ties on count are broken by item[0],
    # also in descending order.
    ranked.sort(reverse=True)
    self._pop_dn = ranked
def terms_within(self, fieldname, text, maxdist, prefix=0):
    """Look up terms near ``text`` in ``fieldname``.

    When this reader has a word graph for the field, the search is run
    against that graph with ``dawg.within``, starting from the field's root
    address. Otherwise the call is handed to the generic
    ``IndexReader.terms_within`` implementation (the slow method).

    :param fieldname: name of the field to search.
    :param text: the text to search around.
    :param maxdist: forwarded to ``dawg.within`` as ``k``, or to the
        fallback as ``maxdist``.
    :param prefix: forwarded unchanged to whichever implementation runs.
    :returns: whatever the selected implementation returns.
    """
    if self.has_word_graph(fieldname):
        return dawg.within(
            self._graph,
            text,
            k=maxdist,
            prefix=prefix,
            address=self._graph.root(fieldname),
        )

    # This reader doesn't have a graph stored for the field, so fall back
    # to the slow base-class method.
    return IndexReader.terms_within(
        self, fieldname, text, maxdist, prefix=prefix
    )
def lexicon(self, fieldname):
    """Return the terms of ``fieldname``.

    The field name is checked with ``self._test_field`` first. If a field
    cache is already loaded for the field, its in-memory values are returned
    via ``self._texts_in_fieldcache``; otherwise the generic
    ``IndexReader.lexicon`` implementation is used.

    :param fieldname: name of the field whose terms are wanted.
    :returns: whatever the selected implementation returns.
    """
    self._test_field(fieldname)

    if not self.fieldcache_loaded(fieldname):
        # Nothing cached for this field: defer to the base implementation.
        return IndexReader.lexicon(self, fieldname)

    # The field's values are already in memory, so serve them from the
    # loaded field cache.
    return self._texts_in_fieldcache(fieldname)
def expand_prefix(self, fieldname, prefix):
    """Return the terms of ``fieldname`` selected by ``prefix``.

    The field name is checked with ``self._test_field`` first. If a field
    cache is already loaded for the field, the lookup is answered from it by
    ``self._texts_in_fieldcache(fieldname, prefix)``; otherwise the generic
    ``IndexReader.expand_prefix`` implementation is used.

    :param fieldname: name of the field to look in.
    :param prefix: the prefix forwarded to whichever implementation runs.
    :returns: whatever the selected implementation returns.
    """
    self._test_field(fieldname)

    if not self.fieldcache_loaded(fieldname):
        # Nothing cached for this field: call the base implementation.
        return IndexReader.expand_prefix(self, fieldname, prefix)

    # The field's values are already in memory, so serve them from the
    # loaded field cache.
    return self._texts_in_fieldcache(fieldname, prefix)
from whoosh.reading import IndexReader from whoosh.index import open_dir from whoosh.qparser import QueryParser from whoosh.lang.porter import stem import supportingFunctions import math import nltk import os import re from nltk.corpus import stopwords nltk.download('stopwords') stop_words = set(stopwords.words('english')) ix = open_dir("indexdir_robust04_full") IndexReader() #xx=IndexReader.all_terms('content') p = ix.reader() p2 = ix.schema print((ix.schema)) # all the words in the corpus bytestrings = list(p.lexicon("content")) fieldobj = ix.schema["content"] words = [fieldobj.from_bytes(bs) for bs in p.lexicon("content")] #print ("the words are :" , words) # queries is a dictionary with the keys being the query numbers and the elements are the query words queries = {} queries_path = '/home/niloo/rb04-queries2' with open(queries_path, 'r') as q_file: