import sys

from ifind.common.language_model import LanguageModel
from ifind.search.engines.whooshtrec import Whooshtrec
from ifind.search import Query
from compute_snippet_len_gain import make_query, get_words_from_snippet, compute_length, compute_info_gain

bm25_search_engine = Whooshtrec(
    whoosh_index_dir='/Users/david/Workspace/indexes/aquaint_test500_whoosh/',
    stopwords_file='',
    model=1,
    newschema=True)

bm25_search_engine.snippet_size = 40


def main():
    log_file = sys.argv[1]
    lm = LanguageModel(file='vocab.in')

    # Interface...       1   2   3   4
    snippet_sizes =    [ 2,  0,  1,  4]
    snippet_surround = [40, 40, 40, 40]

    with open(log_file) as f:
        for s in f:
            fields = s.strip().split()
            amtid = fields[3]
            interface = fields[5]
            order = fields[6]
            topic = fields[7]
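# Hypothetical helper (not in the original file): maps a logged interface id
# (1-4) onto the snippet size / surround configuration enumerated above, so
# per-line log processing can recover the snippet settings used for that
# interface. A minimal sketch only; the original script may do this inline.
def get_snippet_config(interface):
    snippet_sizes = [2, 0, 1, 4]
    snippet_surround = [40, 40, 40, 40]
    idx = int(interface) - 1
    return snippet_sizes[idx], snippet_surround[idx]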
import logging

from whoosh.index import open_dir
from ifind.search.engines.whooshtrec import Whooshtrec

# BaseSearchInterface and Document are assumed to be provided by the
# surrounding search interfaces package; their import path is not shown here.

log = logging.getLogger(__name__)


class WhooshSearchInterface(BaseSearchInterface):
    """
    A search interface making use of the Whoosh indexing library - and the ifind search components.
    Set model = 0 for TF-IDF.
    Set model = 1 for BM25 (defaults to b=0.75); set pval to change b.
    Set model = 2 for PL2 (defaults to c=10.0); set pval to change c.
    """
    def __init__(self, whoosh_index_dir, model=2, implicit_or=True, pval=None,
                 frag_type=2, frag_size=2, frag_surround=40, host=None, port=0):
        super(WhooshSearchInterface, self).__init__()
        log.debug("Whoosh Index to open: {0}".format(whoosh_index_dir))

        self.__index = open_dir(whoosh_index_dir)
        self.__reader = self.__index.reader()
        self.__redis_conn = None

        if host is None:
            self.__engine = Whooshtrec(whoosh_index_dir=whoosh_index_dir,
                                       model=model,
                                       implicit_or=implicit_or)
        else:
            self.__engine = Whooshtrec(whoosh_index_dir=whoosh_index_dir,
                                       model=model,
                                       implicit_or=implicit_or,
                                       cache='engine',
                                       host=host,
                                       port=port)

        # Update (2017-05-02) for snippet fragment tweaking.
        # SIGIR Study (2017) uses frag_type==1 (2 doesn't give sensible results),
        # surround==40, snippet_sizes==2,0,1,4.
        self.__engine.snippet_size = frag_size
        self.__engine.set_fragmenter(frag_type=frag_type, surround=frag_surround)

        if pval:
            self.__engine.set_model(model, pval)

    def issue_query(self, query, top=100):
        """
        Allows one to issue a query to the underlying search engine.
        Takes an ifind Query object.
        """
        query.top = top
        response = self.__engine.search(query)

        self._last_query = query
        self._last_response = response
        return response

    def get_document(self, document_id):
        """
        Retrieves a Document object for the given document, as specified by document_id.
        """
        fields = self.__reader.stored_fields(int(document_id))

        title = fields['title']
        content = fields['content']
        document_num = fields['docid']
        document_date = fields['timedate']
        document_source = fields['source']

        document = Document(id=document_id, title=title, content=content)
        document.date = document_date
        document.doc_id = document_num
        document.source = document_source

        return document
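# A minimal usage sketch (not in the original file): exercising the interface
# above, assuming a valid Whoosh index at 'index_dir/' (hypothetical path).
# result.whooshid is the Whoosh internal document number, which get_document()
# expects (note the int() cast and stored_fields() lookup above).
if __name__ == '__main__':
    from ifind.search import Query

    interface = WhooshSearchInterface('index_dir/', model=1, pval=0.75)
    response = interface.issue_query(Query('sea pirates'), top=10)

    for result in response.results:
        document = interface.get_document(result.whooshid)
        print document.title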
import nltk
from bs4 import BeautifulSoup

from ifind.search import Query
from ifind.search.engines.whooshtrec import Whooshtrec

# extract_nouns() and extract_entity_names() are assumed to be defined
# elsewhere in this module; a sketch of extract_entity_names() is given
# after main().


def main():
    bm25 = Whooshtrec(
        whoosh_index_dir='fullindex/',
        stopwords_file='',
        model=1,
        newschema=True)

    query = Query('Sea Pirates')
    query.skip = 1
    query.top = 5
    bm25.snippet_size = 3

    response = bm25.search(query)

    i = 1
    for result in response.results:
        print i, len(result.summary)
        # print result.summary
        # print "--------------"

        # Strip the snippet's HTML markup down to plain text.
        soup = BeautifulSoup(result.summary, 'html.parser')
        text = soup.getText()
        # print text
        print "--------------"

        n = extract_nouns(text)
        print set(n)
        print "--------------"

        # Tokenise the snippet text into sentences, then words.
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        # print tokenized_sentences

        # Flatten the tokenised sentences into a single word list.
        cat_sentences = []
        for ts in tokenized_sentences:
            for w in ts:
                cat_sentences.append(w)
        # print cat_sentences

        # POS-tag the words and keep only the noun tags.
        tagged = nltk.pos_tag(cat_sentences)
        nouns = [word for word, pos in tagged
                 if pos in ('NN', 'NNP', 'NNS', 'NNPS')]
        downcased = [x.lower() for x in nouns]
        joined = " ".join(downcased).encode('utf-8')
        into_string = str(nouns)
        print into_string
        # print tokenized_sentences
        print "--------------"

        # Named-entity chunk each sentence and collect the entity names.
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.chunk.ne_chunk_sents(tagged_sentences, binary=True)

        entity_names = []
        for tree in chunked_sentences:
            # Print results per sentence
            # print extract_entity_names(tree)
            entity_names.extend(extract_entity_names(tree))

        print set(entity_names)
        i += 1
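# A minimal sketch (not shown in the original fragment) of the
# extract_entity_names() helper the loop above relies on, following the
# standard NLTK cookbook recursion over trees from ne_chunk_sents(binary=True),
# where named-entity subtrees are labelled 'NE'.
def extract_entity_names(tree):
    entity_names = []
    # Leaves are (word, pos) tuples and have no label; only recurse on subtrees.
    if hasattr(tree, 'label') and tree.label:
        if tree.label() == 'NE':
            entity_names.append(' '.join(child[0] for child in tree))
        else:
            for child in tree:
                entity_names.extend(extract_entity_names(child))
    return entity_names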
    'pretask/3/', 'search/3/',
    'pretask/4/', 'search/4/',
    'endexperiment/', 'logout/'
]

suggestion_trie = AutocompleteTrie(
    min_occurrences=3,
    suggestion_count=8,
    include_stopwords=False,
    stopwords_path=os.path.join(work_dir, "data/stopwords.txt"),
    vocab_path=os.path.join(work_dir, "data/vocab.txt"),
    vocab_trie_path=os.path.join(work_dir, "data/vocab_trie.dat"))

search_engine = Whooshtrec(
    whoosh_index_dir=my_whoosh_doc_index_dir,
    stopwords_file=stopword_file,
    model=1,
    newschema=True)

search_engine.key_name = 'bm25'
search_engine.set_fragmenter(frag_type=2, surround=30)

exp_test = ExperimentSetup(
    workflow=snippet_flow,
    engine=search_engine,
    practice_topic='367',
    topics=['347', '341', '435', '408'],
    rpp=10,
    practice_interface=1,
    interface=[1, 2, 3, 4],
    rotation_type=1,
import sys

from ifind.search import Query
from ifind.search.engines.whooshtrec import Whooshtrec
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

whoosh_path = sys.argv[1]
stopwords_path = sys.argv[2]

page = 3
page_len = 10

search_engine = Whooshtrec(
    whoosh_index_dir=whoosh_path,
    stopwords_file=stopwords_path,
    model=1,
    newschema=True)

query = Query('wildlife extinction')
query.skip = page
query.top = page_len

response = search_engine.search(query)

for result in response:
    print '{0} {1}'.format(result.whooshid, result.rank)

print response.result_total
print response.results_on_page
print response.actual_page