class WhooshSearchInterface(BaseSearchInterface):
    """
    A search interface making use of the Whoosh indexing library - and the ifind search components.

    Set model = 0 for TF-IDF
    Set model = 1 for BM25 (defaults to b=0.75), set pval to change b.
    Set model = 2 for PL2 (defaults to c=10.), set pval to change c.
    """

    def __init__(self, whoosh_index_dir, model=2, implicit_or=True, pval=None, frag_type=2, frag_size=2,
                 frag_surround=40, host=None, port=0):
        """
        Opens the Whoosh index and builds the underlying Whooshtrec engine.

        whoosh_index_dir: path of the Whoosh index directory to open.
        model: retrieval model selector (see the class docstring).
        implicit_or: passed straight through to the Whooshtrec engine.
        pval: optional model parameter (b for BM25, c for PL2). None leaves the
              engine's default in place; any other value -- including 0 -- is
              forwarded to the engine via set_model().
        frag_type, frag_surround: passed to the engine's set_fragmenter().
        frag_size: assigned to the engine's snippet_size attribute.
        host, port: when host is given, the engine is created with cache='engine'
                    and these connection details; otherwise no cache arguments
                    are passed.
        """
        super(WhooshSearchInterface, self).__init__()
        log.debug("Whoosh Index to open: {0}".format(whoosh_index_dir))

        # A reader is kept open for the lifetime of the interface; get_document() uses it.
        self.__index = open_dir(whoosh_index_dir)
        self.__reader = self.__index.reader()
        self.__redis_conn = None

        if host is None:
            self.__engine = Whooshtrec(whoosh_index_dir=whoosh_index_dir,
                                       model=model,
                                       implicit_or=implicit_or)
        else:
            self.__engine = Whooshtrec(whoosh_index_dir=whoosh_index_dir,
                                       model=model,
                                       implicit_or=implicit_or,
                                       cache='engine',
                                       host=host,
                                       port=port)

        # Update (2017-05-02) for snippet fragment tweaking.
        # SIGIR Study (2017) uses frag_type==1 (2 doesn't give sensible results), surround==40, snippet_sizes==2,0,1,4
        self.__engine.snippet_size = frag_size
        self.__engine.set_fragmenter(frag_type=frag_type, surround=frag_surround)

        # Compare against None rather than relying on truthiness: a pval of 0 / 0.0
        # is an explicit setting and must not be silently dropped.
        if pval is not None:
            self.__engine.set_model(model, pval)

    def issue_query(self, query, top=100):
        """
        Allows one to issue a query to the underlying search engine. Takes an ifind Query object.

        The query's top attribute is overwritten with the top parameter before the
        search is run. The query and the response are remembered on the instance as
        _last_query and _last_response; the response is also returned.
        """
        query.top = top
        response = self.__engine.search(query)

        self._last_query = query
        self._last_response = response

        return response

    def get_document(self, document_id):
        """
        Retrieves a Document object for the given document specified by parameter document_id.

        document_id is converted with int() and used to look up the stored fields in
        the index reader, so a value that cannot be converted raises ValueError or
        TypeError. The returned Document keeps the document_id as passed in for its
        id, and carries the stored 'docid', 'timedate' and 'source' fields as
        doc_id, date and source respectively.
        """
        fields = self.__reader.stored_fields(int(document_id))

        title = fields['title']
        content = fields['content']
        document_num = fields['docid']
        document_date = fields['timedate']
        document_source = fields['source']

        document = Document(id=document_id, title=title, content=content)
        document.date = document_date
        document.doc_id = document_num
        document.source = document_source

        return document
# Autocomplete suggestions; the stopword and vocabulary files live under
# work_dir's data directory.
suggestion_trie = AutocompleteTrie(
    min_occurrences=3,
    suggestion_count=8,
    include_stopwords=False,
    stopwords_path=os.path.join(work_dir, "data/stopwords.txt"),
    vocab_path=os.path.join(work_dir, "data/vocab.txt"),
    vocab_trie_path=os.path.join(work_dir, "data/vocab_trie.dat"),
)

# Search engine over the Whoosh document index. NOTE(review): model=1 is
# presumably BM25, matching the key_name set below -- confirm against Whooshtrec.
search_engine = Whooshtrec(
    whoosh_index_dir=my_whoosh_doc_index_dir,
    stopwords_file=stopword_file,
    model=1,
    newschema=True,
)
search_engine.key_name = 'bm25'
search_engine.set_fragmenter(frag_type=2, surround=30)

# Experiment definition: one practice topic followed by four study topics.
# NOTE(review): timeout has five entries, presumably the practice topic first
# and then one per study topic -- confirm against ExperimentSetup.
exp_test = ExperimentSetup(
    workflow=snippet_flow,
    engine=search_engine,
    practice_topic='367',
    topics=['347', '341', '435', '408'],
    rpp=10,
    practice_interface=1,
    interface=[1, 2, 3, 4],
    rotation_type=1,
    description='standard condition bm25 test',
    trie=suggestion_trie,
    autocomplete=True,
    timeout=[150, 600, 600, 600, 600],  # 300s = 5min; 600s = 10min; 1200s = 20min
)