def get_words(self, unit):
    """
    Returns list of word pairs for a unit.
    """
    words = set()

    # Prepare analyzers
    # - standard analyzer simply splits words
    # - stemming extracts stems, to catch things like plurals
    analyzers = [
        StandardAnalyzer(),
        StemmingAnalyzer(),
    ]
    lang_code = unit.translation.language.base_code()
    # Add per language analyzer if Whoosh has it
    if has_stemmer(lang_code):
        analyzers.append(LanguageAnalyzer(lang_code))
    # Add ngram analyzer for languages like Chinese or Japanese
    if unit.translation.language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update([
                    token.text for token in analyzer(force_unicode(text))
                ])
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error, sys.exc_info())

    # Grab all words in the dictionary
    dictionary = self.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language
    )

    if len(words) == 0:
        # No extracted words, no dictionary
        dictionary = dictionary.none()
    else:
        # Build the query for fetching the words
        # Can not use __in as we want case insensitive lookup
        query = Q()
        for word in words:
            query |= Q(source__iexact=word)

        # Filter dictionary
        dictionary = dictionary.filter(query)

    return dictionary
def exec_comp():
    # text analyzers
    selected_analyzers = [StemmingAnalyzer(), SimpleAnalyzer(), StandardAnalyzer(),
                          RegexAnalyzer(), FancyAnalyzer(), NgramAnalyzer(5),
                          KeywordAnalyzer(), LanguageAnalyzer('en')]

    # analyzer names, used for the graph and for the MRR table
    sel_ana = ['StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
               'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
               'KeywordAnalyzer()', 'LanguageAnalyzer()']

    i = 0      # counter
    mrrs = []  # list where MRR values for each SE configuration will be stored

    # scoring functions
    scoring_functions = [scoring.TF_IDF(), scoring.Frequency(),
                         scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    # ground truth
    gt1 = pd.read_csv(gt_path, sep='\t')

    # combinations of every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            i = i + 1
            # execute queries for the chosen configuration combination
            sr_1 = exec_queries(selected_analyzers[x], scoring_functions[y])
            # save results of the search engine
            sr_1.to_csv(path_start + "/part_1/" + str(i) + "__.csv", index=False)
            # calculate MRR
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1, sr_1)))

    mrrs_saving = pd.DataFrame(mrrs)
    print(mrrs_saving)
    # store MRR table
    mrrs_saving.to_csv(path_start + "/part_1/" + mrrs_name, index=False)
def create(self):
    analyzer = StandardAnalyzer(minsize=1, stoplist=None) | CleanUpFilter()
    schema = Schema(
        source=TEXT(stored=True, analyzer=analyzer),
        target=TEXT(stored=True, analyzer=analyzer),
        comment=TEXT(stored=True),
        context=TEXT(stored=True),
        softcatala=BOOLEAN(stored=True),
        project=TEXT(stored=True),
    )

    if not os.path.exists(self.dir_name):
        os.mkdir(self.dir_name)

    ix = create_in(self.dir_name, schema)
    self.writer = ix.writer()
def _mk_schema(self, dsinfo):
    from whoosh import fields as wf
    from whoosh.analysis import StandardAnalyzer

    # TODO support some customizable mapping to homogenize some metadata fields
    # onto a given set of index keys
    self.schema = wf.Schema(
        id=wf.ID,
        path=wf.ID(stored=True),
        type=wf.ID(stored=True),
        parentds=wf.ID(stored=True),
        meta=wf.TEXT(
            stored=False,
            analyzer=StandardAnalyzer(minsize=2)))
def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost,
            tool_label_boost, tool_stub_boost, tool_help_boost,
            tool_search_limit, tool_enable_ngram_search,
            tool_ngram_minsize, tool_ngram_maxsize ):
    """
    Perform search on the in-memory index. Weight in the given boosts.
    """
    # Change field boosts for searcher
    searcher = self.index.searcher(
        weighting=BM25F(
            field_B={ 'name_B': float( tool_name_boost ),
                      'section_B': float( tool_section_boost ),
                      'description_B': float( tool_description_boost ),
                      'labels_B': float( tool_label_boost ),
                      'stub_B': float( tool_stub_boost ),
                      'help_B': float( tool_help_boost ) }
        )
    )
    # Set query to search name, description, section, help, and labels.
    parser = MultifieldParser( [ 'name', 'description', 'section', 'help', 'labels', 'stub' ], schema=self.schema )
    # Hyphens are wildcards in Whoosh causing bad things
    if q.find( '-' ) != -1:
        q = (' ').join( [ token.text for token in self.rex( to_unicode( q ) ) ] )
    # Perform tool search with ngrams if set to true in the config file
    if ( tool_enable_ngram_search is True or tool_enable_ngram_search == "True" ):
        hits_with_score = {}
        token_analyzer = StandardAnalyzer() | analysis.NgramFilter( minsize=int( tool_ngram_minsize ),
                                                                    maxsize=int( tool_ngram_maxsize ) )
        ngrams = [ token.text for token in token_analyzer( q ) ]
        for query in ngrams:
            # Get the tool list with respective scores for each qgram
            curr_hits = searcher.search( parser.parse( '*' + query + '*' ), limit=float( tool_search_limit ) )
            for i, curr_hit in enumerate( curr_hits ):
                is_present = False
                for prev_hit in hits_with_score:
                    # Check if the tool appears again for the next qgram search
                    if curr_hit[ 'id' ] == prev_hit:
                        is_present = True
                        # Add the current score with the previous one if the
                        # tool appears again for the next qgram
                        hits_with_score[ prev_hit ] = curr_hits.score(i) + hits_with_score[ prev_hit ]
                # Add the tool if not present to the collection with its score
                if not is_present:
                    hits_with_score[ curr_hit[ 'id' ] ] = curr_hits.score(i)
        # Sort the results based on aggregated BM25 score in decreasing order of scores
        hits_with_score = sorted( hits_with_score.items(), key=lambda x: x[1], reverse=True )
        # Return the tool ids
        return [ item[0] for item in hits_with_score[ 0:int( tool_search_limit ) ] ]
    else:
        # Perform the search
        hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
        return [ hit[ 'id' ] for hit in hits ]
def tokenize(text, stemming=True, stoplist=None):
    # kstemmer = Stemmer()
    # map punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)

    if stemming:
        analyzer = StemmingAnalyzer(stoplist=stoplist, minsize=2, stemfn=kstemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=2)

    tokens = [token.text for token in analyzer(text)]
    tokens = [word for word in tokens if not contains_digits(word)]
    return tokens
def create(self, in_memory=False):
    tokenizer_pattern = rcompile(r"(\w|·)+(\.?(\w|·)+)*")  # Includes l·l
    analyzer = StandardAnalyzer(minsize=1, stoplist=None, expression=tokenizer_pattern)

    schema = Schema(verb_form=TEXT(stored=True, sortable=True, analyzer=analyzer),
                    index_letter=TEXT(stored=True, analyzer=analyzer),
                    file_path=TEXT(stored=True, sortable=True))

    if os.path.exists(self.dir_name):
        shutil.rmtree(self.dir_name)

    os.makedirs(self.dir_name)

    ix = create_in(self.dir_name, schema)
    self.writer = ix.writer()
    return ix
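# Illustrative sketch (the sample word is arbitrary, not from the project above):
# the custom expression keeps the Catalan middle dot inside a single token,
# which the default StandardAnalyzer pattern would split in two.
import re
from whoosh.analysis import StandardAnalyzer

_cat_ana = StandardAnalyzer(minsize=1, stoplist=None,
                            expression=re.compile(r"(\w|·)+(\.?(\w|·)+)*", re.UNICODE))
print([t.text for t in _cat_ana("col·legi")])            # expected: ['col·legi']
print([t.text for t in StandardAnalyzer()("col·legi")])  # expected: ['col', 'legi']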
def create_schema_with_stopwords(directory):
    schema = Schema(FileName=TEXT(stored=True),
                    FilePath=TEXT(stored=True),
                    Title=TEXT(stored=True),
                    Content=TEXT(stored=True, analyzer=StandardAnalyzer(stoplist=None)),
                    Size=TEXT(stored=True),
                    LastModified=TEXT(stored=True),
                    LastAccessed=TEXT(stored=True),
                    CreationTime=TEXT(stored=True),
                    Mode=TEXT(stored=True),
                    text=TEXT(spelling=True))

    create_non_existent_directory(directory)
    ix = create_in(directory, schema)

    global writer
    writer = ix.writer()
    return
def __init__(self, analyzer=None, phrase=True, vector=None, stored=False,
             field_boost=1.0, multitoken_query="first"):
    """
    :param analyzer: The analysis.Analyzer to use to index the field
        contents. See the analysis module for more information. If you omit
        this argument, the field uses analysis.StandardAnalyzer.
    :param phrase: Whether to store positional information to allow phrase
        searching.
    :param vector: A :class:`whoosh.formats.Format` object to use to store
        term vectors, or ``True`` to store vectors using the same format as
        the inverted index, or ``None`` or ``False`` to not store vectors.
        By default, fields do not store term vectors.
    :param stored: Whether to store the value of this field with the
        document. Since this field type generally contains a lot of text,
        you should avoid storing it with the document unless you need to,
        for example to allow fast excerpts in the search results.
    """
    ana = analyzer or StandardAnalyzer()

    if phrase:
        formatclass = Positions
    else:
        formatclass = Frequency

    self.format = formatclass(analyzer=ana, field_boost=field_boost)

    if vector:
        if type(vector) is type:
            vector = vector(ana)
        elif isinstance(vector, Format):
            pass
        else:
            vector = formatclass(ana)
    else:
        vector = None
    self.vector = vector

    self.multitoken_query = multitoken_query
    self.scorable = True
    self.stored = stored
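# Illustrative sketch (field names invented, not from the snippet above): how a
# text field like this is typically declared in a schema. When no analyzer is
# passed, StandardAnalyzer is used; an explicit analyzer overrides it.
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer

example_schema = Schema(
    title=TEXT(stored=True),                  # falls back to StandardAnalyzer()
    body=TEXT(analyzer=StemmingAnalyzer(),    # custom analyzer
              phrase=True,                    # keep positions for phrase queries
              stored=False),                  # avoid storing large text
    doc_id=ID(stored=True, unique=True),
)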
def tokenize_corpus(self, corpus_folder, stopwords, out_path, remove_digits=False, minsize=3):
    """Tokenize the corpus and return a dict mapping doc ids to lists of tokens."""
    corpus = self.read_corpus(corpus_folder)
    # set tokenizer
    tokenizer = StandardAnalyzer(stoplist=stopwords, minsize=minsize)
    # tokenize corpus as list of lists
    print('tokenizing corpus ...')
    docs = {}
    for docno, doc in corpus:
        # tokenize doc
        doc_tokens = self.tokenize_doc(doc, tokenizer, remove_digits)
        docs[docno] = doc_tokens
    print('corpus tokenized!')

    print("store processed data")
    with open(out_path + '/docs.json', 'w') as file_docs:
        json.dump(docs, file_docs)
    return docs
def index(root_folder, files, output, language):
    # schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True))
    my_analyzer = StandardAnalyzer(stoplist=getStopWords())
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT(stored=True, vector=True, analyzer=my_analyzer))
    ix = create_in(output, schema)

    total = len(files)
    i = 1
    for f in files:
        print i, "/", total, ":", f
        text = get_text_from_file(f)
        writer = ix.writer()
        writer.add_document(title=unicode(f.replace(root_folder, ""), "utf-8"),
                            path=unicode(f.replace(root_folder, ""), "utf-8"),
                            content=unicode(text, "utf-8"))
        writer.commit()
        i = i + 1
def main():
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")

        if option == "1":
            file_content_doc1 = open("rural.txt").read()
            file_content_doc2 = open("science.txt").read()
            sent_tokenize_list1 = sent_tokenize(file_content_doc1, language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2, language='english')

            if not os.path.exists("index_task2"):
                os.mkdir("index_task2")
            schema = Schema(full_text=TEXT(phrase=True, stored=True,
                                           analyzer=StandardAnalyzer(stoplist=None)))
            ix = create_in("index_task2", schema)
            writer = ix.writer()
            for sentence in sent_tokenize_list1:
                writer.add_document(full_text=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(full_text=sentence)
            writer.commit()
            print("\n\n Index created with various features as its fields")

        elif option == "2":
            ix = index.open_dir("index_task2")
            with ix.searcher() as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query = QueryParser("full_text", ix.schema, group=og).parse(q)
                results = searcher.search(query)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["full_text"], sep=":")
                print("\n")
def create(self, in_memory=False):
    analyzer = StandardAnalyzer(minsize=1, stoplist=None) | CleanUpFilter()
    schema = Schema(source=TEXT(stored=True, analyzer=analyzer),
                    target=TEXT(stored=True, analyzer=analyzer),
                    comment=STORED,
                    context=STORED,
                    softcatala=BOOLEAN,
                    project=ID(stored=True))

    if in_memory:
        st = RamStorage()
        ix = st.create_index(schema)
    else:
        if not os.path.exists(self.dir_name):
            os.mkdir(self.dir_name)

        ix = create_in(self.dir_name, schema)

    self.writer = ix.writer()
    return ix
def get_dictionary(request, unit_id):
    '''
    Lists words from dictionary for current translation.
    '''
    unit = get_object_or_404(Unit, pk=int(unit_id))
    unit.check_acl(request)
    words = set()

    # Prepare analyzers
    # - standard analyzer simply splits words
    # - stemming extracts stems, to catch things like plurals
    analyzers = (StandardAnalyzer(), StemmingAnalyzer())

    # Extract words from all plurals and from context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer in analyzers:
            words = words.union([token.text for token in analyzer(text)])

    # Grab all words in the dictionary
    dictionary = Dictionary.objects.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language
    )

    if len(words) == 0:
        # No extracted words, no dictionary
        dictionary = dictionary.none()
    else:
        # Build the query (can not use __in as we want case insensitive lookup)
        query = Q()
        for word in words:
            query |= Q(source__iexact=word)

        # Filter dictionary
        dictionary = dictionary.filter(query)

    return render_to_response('js/dictionary.html', RequestContext(request, {
        'dictionary': dictionary,
        'translation': unit.translation,
    }))
def indexing(corpus, ram_limit=1024, d_test=True, stemmed=True):
    start_time = time.time()
    global stemming
    stemming = stemmed

    if stemming:
        analyzer = StemmingAnalyzer()
    else:
        analyzer = StandardAnalyzer()

    schema = Schema(doc_id=NUMERIC(stored=True),
                    date=TEXT(analyzer=SpaceSeparatedTokenizer()),
                    headline=TEXT(field_boost=1.5, analyzer=analyzer),
                    dateline=TEXT(analyzer=analyzer),
                    byline=TEXT(analyzer=analyzer),
                    content=TEXT(analyzer=analyzer))

    index_dir = os.path.join("indexes", "docs")

    # Clear existing indexes/docs folder and make new one
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)
    os.makedirs(index_dir)

    # Create index in indexes/docs folder
    ix = create_in(index_dir, schema)
    writer = ix.writer(limitmb=ram_limit)
    traverse_folders(writer, corpus, d_test=d_test)
    writer.commit()
    end_time = time.time()

    # Traverses all files in the indexes/docs folder to calculate disk space taken up by the index
    space = 0
    for subdir, dirs, files in os.walk(index_dir):
        space += sum(os.stat(os.path.join(index_dir, file)).st_size for file in files)

    return ix, end_time - start_time, space
def _search_ngrams(self, cleaned_query: str, tool_ngram_minsize: CanConvertToInt,
                   tool_ngram_maxsize: CanConvertToInt,
                   tool_search_limit: CanConvertToFloat) -> List[str]:
    """
    Break tokens into ngrams and search on those instead.
    This should make searching more resistant to typos and unfinished words.
    See docs at https://whoosh.readthedocs.io/en/latest/ngrams.html
    """
    hits_with_score: Dict[str, float] = {}
    token_analyzer = StandardAnalyzer() | analysis.NgramFilter(
        minsize=int(tool_ngram_minsize), maxsize=int(tool_ngram_maxsize))
    ngrams = [token.text for token in token_analyzer(cleaned_query)]
    for query in ngrams:
        # Get the tool list with respective scores for each qgram
        curr_hits = self.searcher.search(self.parser.parse(f"*{query}*"),
                                         limit=float(tool_search_limit))
        for i, curr_hit in enumerate(curr_hits):
            is_present = False
            for prev_hit in hits_with_score:
                # Check if the tool appears again for the next qgram search
                if curr_hit['id'] == prev_hit:
                    is_present = True
                    # Add the current score with the previous one if the
                    # tool appears again for the next qgram
                    hits_with_score[prev_hit] = curr_hits.score(i) + hits_with_score[prev_hit]
            # Add the tool if not present to the collection with its score
            if not is_present:
                hits_with_score[curr_hit['id']] = curr_hits.score(i)
    # Sort the results based on aggregated BM25 score in decreasing order of scores
    hits_with_score_list: List[Tuple[str, float]] = sorted(
        hits_with_score.items(), key=lambda x: x[1], reverse=True)
    # Return the tool ids
    return [item[0] for item in hits_with_score_list[0:int(tool_search_limit)]]
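# Illustrative sketch (sample word and ngram sizes are arbitrary): what the
# StandardAnalyzer | NgramFilter chain used above actually produces, assuming
# whoosh is installed.
from whoosh import analysis

_ngram_ana = analysis.StandardAnalyzer() | analysis.NgramFilter(minsize=3, maxsize=4)
print([t.text for t in _ngram_ana("aligner")])
# prints every 3- and 4-character slice of "aligner", e.g. 'ali', 'alig',
# 'lig', 'lign', ... -- each slice is then searched as its own wildcard query.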
def __init__(self, analyzer=None, phrase=True, vector=None,
             stored=False, field_boost=1.0):
    """
    :stored: Whether to store the value of this field with the document.
        Since this field type generally contains a lot of text, you should
        avoid storing it with the document unless you need to, for example
        to allow fast excerpts in the search results.
    :phrase: Whether to store positional information to allow phrase
        searching.
    :analyzer: The analysis.Analyzer to use to index the field contents.
        See the analysis module for more information. If you omit this
        argument, the field uses analysis.StandardAnalyzer.
    """
    ana = analyzer or StandardAnalyzer()
    if phrase:
        formatclass = Positions
    else:
        formatclass = Frequency
    self.format = formatclass(analyzer=ana, field_boost=field_boost)
    self.vector = vector
    self.scorable = True
    self.stored = stored
def create_table(index_dir, *, overwrite=False):
    analyzer = StandardAnalyzer() | CharsetFilter(accent_map)
    schema = Schema(label=TEXT(stored=True, analyzer=analyzer, lang='fr'),
                    rome=TEXT(stored=True, sortable=True),
                    source=KEYWORD(stored=True, sortable=True),
                    slug=STORED)

    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    elif exists_in(index_dir):
        if not overwrite:
            logger.critical(
                'An index already exists in %s; overwrite flag not set; abandoning',
                index_dir)
            raise RuntimeError('Index already exists')
        logger.warning('Index already found, deleting %s to start anew', index_dir)
        shutil.rmtree(index_dir, ignore_errors=True, onerror=None)
        os.mkdir(index_dir)

    logger.info('Whoosh index %s ready for use', index_dir)
    create_in(index_dir, schema)
    return index_dir
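# Illustrative sketch (sample label is arbitrary, not from the original data):
# how the accent-folding analyzer above normalises French labels, assuming
# whoosh is installed.
from whoosh.analysis import StandardAnalyzer, CharsetFilter
from whoosh.support.charset import accent_map

_accent_ana = StandardAnalyzer() | CharsetFilter(accent_map)
print([t.text for t in _accent_ana("Ingénieur électricité")])
# expected: ['ingenieur', 'electricite'] -- lowercased, accents stripped,
# so queries typed without accents still match the indexed labels.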
class MySchema(SchemaClass):
    '''
    Definition of Schema for indexing documents.
    '''
    url = ID(stored=True, unique=True)
    url_len = NUMERIC(stored=True, default=0)
    url_txt = TEXT(field_boost=1.9, stored=True)
    title_page = TEXT(analyzer=StandardAnalyzer(), field_boost=2.2, stored=True)
    content = TEXT(analyzer=StandardAnalyzer(), stored=True, phrase=True,
                   sortable=True)  # not storing content in the index
    date_created = DATETIME(stored=True)  # maybe we prefer storing date as an ID
    date_updated = DATETIME(stored=True)
    h1 = TEXT(analyzer=StandardAnalyzer(), field_boost=1.75, stored=True)
    h2 = TEXT(analyzer=StandardAnalyzer(), field_boost=1.5, stored=True)
    h3 = TEXT(analyzer=StandardAnalyzer(), field_boost=1.25, stored=True)
    h4 = TEXT(analyzer=StandardAnalyzer(), field_boost=1.10, stored=True)
    rank = NUMERIC(stored=True, default=0)
#-----------------
# Index Schema
#-----------------

class _AccentFilter(Filter):
    def __call__(self, tokens):
        for t in tokens:
            t.text = normalize_token(t.text)
            yield t

_stopwords = frozenset(('a', 'an'))

_analyzer = (StandardAnalyzer(stoplist=_stopwords) | _AccentFilter())

_schema = Schema(
    content=TEXT(stored=True, spelling=True, analyzer=_analyzer),
    data=STORED,  # tuple (label, path, prio, sortkey)
    itemtype=ID,
    asfilter=IDLIST)

_schema['content'].scorable = False


#-----------------
# Maker
#-----------------

class Maker(object):
    def __init__(self, index_dir):
        if os.path.exists(index_dir) and os.path.isfile(index_dir):
tbl = dict.fromkeys(i for i in xrange(sys.maxunicode)
                    if unicodedata.category(unichr(i)).startswith('P'))

# Create function with which to strip punct from unicode
def remove_punctuation(unicode_text):
    return unicode_text.translate(tbl)

# Make the index folder
if not os.path.exists("index_for_sample_files"):
    os.mkdir("index_for_sample_files")

# Specify a list of paths that contain all of the texts we wish to index
text_dirs = ["sample_text_collection"]

# Identify the schema we'll use when creating index
schema = Schema(
    filename=TEXT(stored=True),
    path=TEXT(stored=True),
    author=TEXT(stored=True),
    short_title=TEXT(stored=True),
    full_text=TEXT(stored=True, phrase=True, analyzer=StandardAnalyzer(stoplist=None))
)

# Create the index using the schema defined above
ix = create_in("index_for_sample_files", schema)
writer = ix.writer()

for i in text_dirs:
    for j in glob.glob(i + "/*.txt"):
        with codecs.open(j, "r", "utf-8") as raw_file:
            cleaner_file = remove_punctuation(
                raw_file.read().replace("\r", "").replace("\n", " ")
            )
        # Grab filename, then use that to grab all metadata.
        # NB: Unix users should change the following line to j.split("/")[-1]
        filename = j.split("\\")[-1]
        print "indexing file: ", filename
from whoosh.index import create_in, open_dir, EmptyIndexError
from whoosh.fields import *
from whoosh.qparser import QueryParser, OrGroup
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StandardAnalyzer
import inflect

from globals import *

ana = StandardAnalyzer(expression=re.compile(r"[\w\-+]+(\.?[\w\-]+)*", re.UNICODE),
                       stoplist=None, minsize=0)

schema = Schema(description=TEXT(stored=True),
                path=ID(stored=True),
                lang=TEXT(analyzer=ana, multitoken_query="phrase"))

from curseshelpers import *


class Searcher:
    '''
    Our search engine class! I, for one, welcome our Googly overlords.
    '''
    def __init__(self, indexdir):
        # Open the index if it exists
        try:
            self.ix = open_dir(indexdir)
        except EmptyIndexError:
            self.ix = create_in(indexdir, schema)
        self.indexdir = indexdir

    def get_writer(self):
        self.writer = self.ix.writer()
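# Illustrative sketch (sample strings are arbitrary): the custom expression
# above keeps '-' and '+' inside tokens, so identifiers like "c++" or
# hyphenated words survive as single terms instead of being split.
import re
from whoosh.analysis import StandardAnalyzer

_demo_ana = StandardAnalyzer(expression=re.compile(r"[\w\-+]+(\.?[\w\-]+)*", re.UNICODE),
                             stoplist=None, minsize=0)
print([t.text for t in _demo_ana("C++ e-mail --verbose")])
# expected: ['c++', 'e-mail', '--verbose']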
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import sys, glob, os, unicodedata
from django.conf import settings
from bs4 import BeautifulSoup

stopset = set(stopwords.words('english'))

if not os.path.exists('whoosh'):
    os.mkdir('whoosh')

schema = Schema(title=TEXT(stored=True),
                path=TEXT(stored=True),
                body=TEXT(stored=True, phrase=True, analyzer=StandardAnalyzer(stoplist=None)))

ix = create_in('whoosh', schema)
writer = ix.writer()

for url in os.listdir('articles/static/'):
    for html_file in os.listdir('articles/static/' + url):
        filename = html_file
        path = 'articles/static/' + url + '/' + html_file
        with open(path, 'r') as content_file:
            soup = BeautifulSoup(content_file, "html.parser")
            title = ""
            if soup.title is not None:
                title = soup.title.string
try:
    from whoosh.index import _CURRENT_TOC_VERSION as whoosh_ix_ver
except ImportError:
    from whoosh.filedb.fileindex import _INDEX_VERSION as whoosh_ix_ver

from stemming import stemArabic


def stemfn(word):
    return stemArabic(stem(word))

# word_re = ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]"
analyzer = StandardAnalyzer(
    expression=ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+(?:\.?[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+)*"
) | StemFilter(stemfn)

from whoosh.qparser import FieldAliasPlugin
from whooshSymbolicQParser import MultifieldSQParser


class ExcerptFormatter(object):
    def __init__(self, between="..."):
        self.between = between

    def _format_fragment(self, text, fragment):
        output = []
        index = fragment.startchar

        for t in fragment.matches:
from whoosh.qparser import *
from whoosh.fields import Schema, TEXT, KEYWORD, NUMERIC
from whoosh.analysis import StandardAnalyzer
from whoosh import index
import os.path

# year, author, title and abstract are fields; each field indexes one part of
# the information in the reference documents
schema = Schema(year=NUMERIC(stored=True),
                author=TEXT(analyzer=StandardAnalyzer(stoplist=None), stored=True),
                title=TEXT(analyzer=StandardAnalyzer(stoplist=None), stored=True),
                abstract=TEXT(analyzer=StandardAnalyzer(stoplist=None), stored=True),
                body=TEXT(analyzer=StandardAnalyzer(stoplist=None)),
                subject=KEYWORD(commas=True, scorable=True),
                keywords=KEYWORD(commas=True, scorable=True))

# create an index in a directory if there is none
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)

# open an existing index object
ix = index.open_dir("indexdir")

# create a writer object to add documents to the index
writer = ix.writer()

# now we can add documents to the index
abstract1 = (
def all_stop_words(lst):
    analyzer = StandardAnalyzer()
    for t in analyzer(unicode(lst)):
        if not t.stopped:
            return False
    return True
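# Illustrative usage (assumes the Python 2 context of the snippet above, where
# unicode() exists): StandardAnalyzer drops stop words by default, so the loop
# body only runs for tokens that survived the stop filter.
# all_stop_words(u"the and of")     -> True   (every token is a stop word)
# all_stop_words(u"the quick fox")  -> False  ("quick" and "fox" are kept)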
from solvertools.wordlist import WORDS
from solvertools.normalize import slugify, sanitize
from solvertools.util import data_path
from whoosh.index import open_dir
from whoosh.analysis import StandardAnalyzer
from whoosh import qparser
from operator import itemgetter
from collections import defaultdict
from .conceptnet_numberbatch import load_numberbatch, get_vector, similar_to_term
import re

INDEX = None
QUERY_PARSER = None
NUMBERBATCH = None
ANALYZER = StandardAnalyzer()


def simple_parser(fieldname, schema, group, **kwargs):
    """
    Returns a QueryParser configured to support only +, -, and phrase syntax.
    Modified from Whoosh's SimpleParser to accept a custom 'group' argument.
    """
    pins = [
        qparser.plugins.WhitespacePlugin,
        qparser.plugins.PlusMinusPlugin,
        qparser.plugins.PhrasePlugin
    ]
    orgroup = qparser.syntax.OrGroup
from whoosh.analysis import Filter, LowercaseFilter, StandardAnalyzer, \
    NgramFilter
from whoosh.analysis.tokenizers import IDTokenizer
from whoosh.fields import NUMERIC, STORED, SchemaClass, TEXT


class LodashFilter(Filter):
    def __call__(self, tokens):
        for t in tokens:
            t.text = t.text.replace('_', '')
            yield t


simple_ana = IDTokenizer() | LowercaseFilter() | LodashFilter()
custom_ana = StandardAnalyzer(stoplist=None) | LodashFilter()
# | NgramFilter(minsize=2, maxsize=5, at='start')
# The sorting problem with NgramFilter: less relevant artefacts come first


class IndexSchema(SchemaClass):
    filename = TEXT(stored=True, analyzer=simple_ana)
    symbol = TEXT(stored=True, analyzer=custom_ana)
    module = TEXT(stored=True, analyzer=simple_ana)
    location = STORED()
    kind = STORED()
    sort = NUMERIC(sortable=True)
class WhooshConstants():
    index_dir = configuration.get('whoosh_index_dir')
    tokenized_analyzer = StandardAnalyzer(stoplist=None)
    normalized_analyzer = IDTokenizer() | SubstitutionFilter(r"[\s/,_'-]", "") | LowercaseFilter()
    stem_analyzer = StemmingAnalyzer(r"[\s/,_'-]", gaps=True, stoplist=None)
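# Illustrative sketch (the sample name is arbitrary, not from the original
# project): the normalized_analyzer above collapses a whole phrase into one
# lowercase token with whitespace and the listed punctuation removed.
from whoosh.analysis import IDTokenizer, SubstitutionFilter, LowercaseFilter

_norm = IDTokenizer() | SubstitutionFilter(r"[\s/,_'-]", "") | LowercaseFilter()
print([t.text for t in _norm("Jace, the Mind Sculptor")])
# expected: ['jacethemindsculptor']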
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")

        if option == "1":
            sent_tokenize_list1 = sent_tokenize(file_content_doc1, language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2, language='english')

            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")

            my_analyzer = RegexTokenizer() | StopFilter() | LowercaseFilter() | Lemmatizer()
            pos_tagger = RegexTokenizer() | StopFilter() | LowercaseFilter() | PosTagger()
            wordnetsyn1 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets()
            wordnetsyn2 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets1()
            wordnetsyn3 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets2()
            wordnetsyn4 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets3()

            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True, analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))

            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()

            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence, stem_text=sentence, lemma=sentence,
                                    pos_text=sentence, hypernym=sentence, hyponym=sentence,
                                    meronyms=sentence, holonym=sentence, dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence, stem_text=sentence, lemma=sentence,
                                    pos_text=sentence, hypernym=sentence, hyponym=sentence,
                                    meronyms=sentence, holonym=sentence, dependency=sentence)
            writer.commit()

            print_index_details(ix)
            print("\n\n Index created with various features as its fields")

        elif option == "2":
            ix = index.open_dir("index_task3")
            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser(["standard", "stem_text", "lemma", "pos_text",
                                               "hyponym", "meronyms", "hypernym", "holonym"],
                                              schema=ix.schema, group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                print("\n")

        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
            option = None
        else:
            print("\n Not valid choice try again...!")