Example #1
0
 def __init__(self, ix: index.FileIndex, ix_reader: IndexReader):
     """Wrap an index and reader, precomputing a popularity ordering.

     Builds ``self._pop_dn`` as ``(count, docnum)`` pairs sorted from
     highest to lowest count, where the count is taken from each
     document's stored ``'count'`` field.
     """
     self._ix = ix
     self._reader = ix_reader
     # One pass over the docs, sorted descending in a single expression
     # (equivalent to append-then-sort(reverse=True) on the same pairs).
     self._pop_dn = sorted(
         ((int(dx[1]['count']), dx[0]) for dx in ix_reader.iter_docs()),
         reverse=True)
Example #2
0
    def terms_within(self, fieldname, text, maxdist, prefix=0):
        """Yield terms in *fieldname* within *maxdist* edits of *text*.

        Uses the stored word graph when one exists for the field;
        otherwise falls back to the generic base-class implementation.
        """
        if self.has_word_graph(fieldname):
            # Fast path: walk the stored DAWG for this field
            return dawg.within(self._graph, text, k=maxdist, prefix=prefix,
                               address=self._graph.root(fieldname))
        # No graph stored for this field -- use the slow generic method
        return IndexReader.terms_within(self, fieldname, text, maxdist,
                                        prefix=prefix)
Example #3
0
 def lexicon(self, fieldname):
     """Return an iterator over the terms stored for *fieldname*.

     Serves the terms straight from the in-memory fieldcache when one
     is loaded; otherwise defers to the base IndexReader implementation.
     """
     self._test_field(fieldname)
     if not self.fieldcache_loaded(fieldname):
         # No cache in memory -- delegate to the superclass
         return IndexReader.lexicon(self, fieldname)
     # Cache loaded: the field's values are already in memory
     return self._texts_in_fieldcache(fieldname)
 def lexicon(self, fieldname):
     """Return an iterator over the terms stored for *fieldname*.

     Prefers the already-loaded fieldcache (the values are in memory);
     falls back to the base-class implementation otherwise.
     """
     self._test_field(fieldname)
     cache_ready = self.fieldcache_loaded(fieldname)
     if cache_ready:
         return self._texts_in_fieldcache(fieldname)
     return IndexReader.lexicon(self, fieldname)
Example #5
0
 def expand_prefix(self, fieldname, prefix):
     """Return the terms in *fieldname* that start with *prefix*.

     When a fieldcache for the field is loaded, the values are already
     in memory, so they are yielded from the cache; otherwise the call
     is delegated to the base IndexReader implementation.
     """
     self._test_field(fieldname)
     if self.fieldcache_loaded(fieldname):
         return self._texts_in_fieldcache(fieldname, prefix)
     # No cache loaded -- call super
     return IndexReader.expand_prefix(self, fieldname, prefix)
Example #6
0
 def expand_prefix(self, fieldname, prefix):
     """Return the terms in *fieldname* that start with *prefix*.

     Uses the in-memory fieldcache when one is loaded for the field,
     otherwise defers to the base IndexReader implementation.
     """
     self._test_field(fieldname)
     cached = self.fieldcache_loaded(fieldname)
     return (self._texts_in_fieldcache(fieldname, prefix) if cached
             else IndexReader.expand_prefix(self, fieldname, prefix))
Example #7
0
    def terms_within(self, fieldname, text, maxdist, prefix=0):
        """Yield terms in *fieldname* within *maxdist* edits of *text*.

        Requires a stored word graph for the field; without one the
        generic (slower) base-class method is used instead.
        """
        if not self.has_word_graph(fieldname):
            # This reader has no graph stored for the field
            return IndexReader.terms_within(
                self, fieldname, text, maxdist, prefix=prefix)
        graph = self._graph
        return dawg.within(
            graph, text, k=maxdist, prefix=prefix,
            address=graph.root(fieldname))
Example #8
0
from whoosh.reading import IndexReader
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.lang.porter import stem
import supportingFunctions
import math
import nltk
import os
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Open the pre-built Whoosh index directory for the Robust04 collection
ix = open_dir("indexdir_robust04_full")
# NOTE(review): a bare `IndexReader()` instantiation was removed here --
# it constructed an object and immediately discarded it (dead code).
#xx=IndexReader.all_terms('content')
p = ix.reader()
p2 = ix.schema
print(ix.schema)

# All the terms stored for the "content" field, as bytestrings
bytestrings = list(p.lexicon("content"))
fieldobj = ix.schema["content"]
# Decode each stored bytestring back into a text term; reuse the list
# materialized above instead of re-iterating p.lexicon("content").
words = [fieldobj.from_bytes(bs) for bs in bytestrings]
#print ("the words are :" , words)

# queries maps query number -> list of query words
queries = {}
queries_path = '/home/niloo/rb04-queries2'
with open(queries_path, 'r') as q_file: