def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int, field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float, field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost)
        else:
            schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
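# Hypothetical usage sketch (not part of the snippet above): feed Haystack's
# registered fields into build_schema and create a Whoosh index from the
# result. The backend variable and the index path are assumptions.
from haystack import connections
from whoosh.filedb.filestore import FileStorage

all_fields = connections['default'].get_unified_index().all_searchfields()
content_field_name, schema = backend.build_schema(all_fields)  # backend: the custom WhooshSearchBackend (assumed)
storage = FileStorage('/tmp/whoosh_index')  # path is an assumption
storage.create()
ix = storage.create_index(schema)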
def __build_whoosh_index(self, schema_dir):
    fields = dict(
        node=TEXT(),
        fullname=TEXT(stored=True),
        path=TEXT(),
        type=NGRAM(minsize=4),
        study=NGRAM(field_boost=10.0),
        name=NGRAMWORDS(minsize=3, field_boost=3.0),
        metadata=NGRAMWORDS(minsize=3),
    )
    schema = Schema(**fields)
    self.ix = create_in(schema_dir, schema)
    with self.ix.writer(procs=2, multisegment=True, limitmb=512) as writer:
        for key, value in self._tree_dict.items():
            writer.add_document(node=key.replace('\\', ' ').replace('_', ' '),
                                path=value.get('conceptPath'),
                                fullname=key,
                                type=value.get('type'),
                                study=str(value.get('studyId')),
                                name=str(value.get('name')),
                                metadata=str(value.get('metadata')))
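# Hypothetical usage sketch (not part of the method above): query an index
# built this way. The directory name and search term are assumptions.
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser

ix = open_dir('schema_dir')  # same directory passed to __build_whoosh_index (assumed)
with ix.searcher() as searcher:
    # NGRAM/NGRAMWORDS fields match partial tokens, so 'gluco' can hit 'glucose'
    query = MultifieldParser(['name', 'metadata', 'study'], ix.schema).parse('gluco')
    for hit in searcher.search(query, limit=10):
        print(hit['fullname'])  # fullname is stored, so it is available on the hit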
def _to_whoosh_field(self, field, field_name=None):
    # If the field is an AutocompleteField or has a truthy partial_match
    # attribute, treat it as an autocomplete field.
    if isinstance(field, AutocompleteField) or \
            (hasattr(field, 'partial_match') and field.partial_match):
        whoosh_field = NGRAMWORDS(stored=False,
                                  minsize=self.ngram_length[0],
                                  maxsize=self.ngram_length[1],
                                  queryor=True)
    else:
        # TODO other types of fields: https://whoosh.readthedocs.io/en/latest/api/fields.html
        whoosh_field = TEXT(
            stored=False,
            field_boost=get_boost(field),
            lang=self.language,
            analyzer=self.analyzer,
        )
    if not field_name:
        field_name = _get_field_mapping(field)
    return field_name, whoosh_field
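# Illustrative sketch (not from the backend above): queryor=True makes a query
# against an NGRAMWORDS field OR its n-grams together instead of requiring all
# of them, which keeps partial-match queries forgiving. Names are placeholders.
import tempfile
from whoosh.fields import Schema, ID, NGRAMWORDS
from whoosh.index import create_in
from whoosh.qparser import QueryParser

demo_schema = Schema(pk=ID(stored=True),
                     name=NGRAMWORDS(minsize=2, maxsize=4, queryor=True))
demo_ix = create_in(tempfile.mkdtemp(), demo_schema)
with demo_ix.writer() as writer:
    writer.add_document(pk=u'1', name=u'autocomplete field')
with demo_ix.searcher() as searcher:
    hits = searcher.search(QueryParser('name', demo_schema).parse(u'autoc'))
    print([hit['pk'] for hit in hits])  # the partial term still matches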
def _setup(self):
    self._redis = getattr(self, '_redis', None)
    if not self._redis:
        self._redis = redis()  # XXX test cases won't get correctly unpickled because of this

    self.schema = Schema(content=NGRAMWORDS(stored=False))
    self.schema.add("object_id", ID(stored=True, unique=True))
    self.schema.add("entity_id", ID(stored=True, unique=True))
    self.schema.add('sha1', ID(stored=True, unique=True))
    for a in list(ATTRS.keys()):
        self.schema.add(a, KEYWORD())

    self.objects = self.xml_dict('objects')
    self.parts = self.json_dict('parts')
    self.storage = FileStorage(os.path.join(self._dir, self._name))
    try:
        self.index = self.storage.open_index(schema=self.schema)
    except BaseException as ex:
        log.warn(ex)
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self._reindex()
class Indexer:
    schema = Schema(
        tagSubject=ID(stored=True),
        #tagPrefLabel=TEXT(stored=True),
        tagPrefLabel=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                                tokenizer=None, at='start', queryor=False, sortable=False),
        termPrefLabel=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                                 tokenizer=None, at='start', queryor=False, sortable=False),
        termAltLabel=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                                tokenizer=None, at='start', queryor=False, sortable=False),
        termBroader=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                               tokenizer=None, at='start', queryor=False, sortable=False),
        termNarrower=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                                tokenizer=None, at='start', queryor=False, sortable=False),
        tagScopeNote=TEXT(stored=True),
        spellingEN=TEXT(stored=True, spelling=True),
        spellingDE=TEXT(stored=True, spelling=True))

    __writer = None
    wordSetEN = set()
    wordSetDE = set()

    def __init__(self, rdfGraph):
        if rdfGraph is None:
            return
        self.createNewIndex()
        count = 0
        for subject, predicate, obj in rdfGraph.graph:
            if rdfGraph.isInKeyScheme(subject) or rdfGraph.isInTagScheme(subject):
                if predicate == SKOS.prefLabel:
                    count += 1
                    print str(count) + ': Indexing tagPrefLabel: ' + str(obj)
                    label = utils.wsWord(obj)
                    lit = Literal(label, obj.language)
                    self.addTagPrefLabel(subject, lit)
                elif predicate == SKOS.scopeNote:
                    count += 1
                    print str(count) + ': Indexing tagScopeNote: ' + str(obj)
                    self.addTagScopeNote(subject, obj)
            elif rdfGraph.isInTermScheme(subject):
                tagSubjectList = self.getTagsOfRelTerm(rdfGraph, subject)
                if predicate == SKOS.prefLabel:
                    count += 1
                    lang = obj.language
                    if lang == 'en' or lang == 'de':
                        print str(count) + ': Indexing termPrefLabel: ' + str(obj)
                        self.addTermPrefLabel(tagSubjectList, obj)
                if predicate == SKOS.altLabel:
                    count += 1
                    lang = obj.language
                    if lang == 'en' or lang == 'de':
                        print str(count) + ': Indexing termAltLabel: ' + str(obj)
                        self.addTermAltLabel(tagSubjectList, obj)
                if predicate == SKOS.broader:
                    count += 1
                    lang = obj.language
                    if lang == 'en' or lang == 'de':
                        print str(count) + ': Indexing termBroader: ' + str(obj)
                        self.addTermBroader(tagSubjectList, obj)
                if predicate == SKOS.narrower:
                    count += 1
                    lang = obj.language
                    if lang == 'en' or lang == 'de':
                        print str(count) + ': Indexing termNarrower: ' + str(obj)
                        self.addTermNarrower(tagSubjectList, obj)
        self.addSpellings()
        self.commit()

    splitChars = re.compile('[ ="._,:;/\?\(\)\]\[\!\*]')

    def addToWordList(self, words, filterShort=None):
        lang = words.language
        wordList = self.splitChars.split(words)
        for word in wordList:
            if len(word) <= 1:
                continue
            if filterShort and len(word) <= filterShort:  # skip short words when 'filterShort' is set
                continue
            if lang == 'en':
                word = utils.eszettToSS(word)
                self.wordSetEN.add(word)
            elif lang == 'de':
                word = utils.eszettToSS(word)
                self.wordSetDE.add(word)
            else:
                translator = Translator()
                if not word in self.wordSetDE and word not in self.wordSetEN:
                    try:
                        transWordDE = translator.translateENToDE(word)
                        transWordDE = utils.eszettToSS(transWordDE)
                        self.wordSetDE.add(transWordDE)
                        self.wordSetEN.add(utils.eszettToSS(word))
                    except:
                        pass

    def addSpellings(self):
        countEN = 0
        countDE = 0
        for word in self.wordSetEN:
            countEN += 1
            print str(countEN) + ': Indexing EN spelling for word: ' + word
            self.__writer.add_document(spellingEN=unicode(word))
        for word in self.wordSetDE:
            countDE += 1
            print str(countDE) + ': Indexing DE spelling for word: ' + word
            self.__writer.add_document(spellingDE=unicode(word))

    def getTagsOfRelTerm(self, rdfGraph, relTermSubject):
        '''Returns a list of subjects that point to this RelatedTerm 'subject'.'''
        generatorList = rdfGraph.getRelatedMatch(relTermSubject)
        return utils.genToList(generatorList)

    def createNewIndex(self):
        ix = create_in(utils.indexerDir(), self.schema, indexname=utils.indexName)
        self.__writer = ix.writer()

    def addTagPrefLabel(self, tagSubject, tagPrefLabel):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        self.__writer.add_document(tagSubject=unicode(tagSubject),
                                   tagPrefLabel=unicode(tagPrefLabel))
        self.addToWordList(tagPrefLabel)

    def addTagScopeNote(self, tagSubject, tagScopeNote):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        self.__writer.add_document(tagSubject=unicode(tagSubject),
                                   tagScopeNote=unicode(tagScopeNote))
        self.addToWordList(tagScopeNote, 5)

    def addTermPrefLabel(self, tagSubjectList, termPrefLabel):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        for tagSubject in tagSubjectList:
            self.__writer.add_document(tagSubject=unicode(tagSubject),
                                       termPrefLabel=unicode(termPrefLabel))
        self.addToWordList(termPrefLabel)

    def addTermAltLabel(self, tagSubjectList, termAltLabel):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        for tagSubject in tagSubjectList:
            self.__writer.add_document(tagSubject=unicode(tagSubject),
                                       termAltLabel=unicode(termAltLabel))
        self.addToWordList(termAltLabel)

    def addTermBroader(self, tagSubjectList, termBroader):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        for tagSubject in tagSubjectList:
            self.__writer.add_document(tagSubject=unicode(tagSubject),
                                       termBroader=unicode(termBroader))
        self.addToWordList(termBroader)

    def addTermNarrower(self, tagSubjectList, termNarrower):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        for tagSubject in tagSubjectList:
            self.__writer.add_document(tagSubject=unicode(tagSubject),
                                       termNarrower=unicode(termNarrower))
        self.addToWordList(termNarrower)

    def commit(self):
        self.__writer.commit()
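    # Hypothetical usage sketch (not part of the original Indexer class): once
    # the index is committed, the spelling fields can back a "did you mean"
    # lookup via Whoosh's corrector API, reusing the utils helpers above.
    def suggestSpelling(self, word, lang='en'):
        ix = index.open_dir(utils.indexerDir(), indexname=utils.indexName)
        fieldName = 'spellingEN' if lang == 'en' else 'spellingDE'
        with ix.searcher() as searcher:
            return searcher.corrector(fieldName).suggest(word, limit=5)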
""" doc = WordPunctTokenizer().tokenize(txt) doc = [word for word in doc if word not in stopword_set] doc = [word for word in doc if word.isalpha()] return doc stopword_set = set(stopwords.words('german')) # Index schema analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) if USE_NGRAM: schema = Schema(title=TEXT(stored=True), body=TEXT(analyzer=analyzer, stored=True), ngrams=NGRAMWORDS(minsize=NGRAM_MIN_SIZE, maxsize=NGRAM_MAX_SIZE, stored=False, at=None)) else: schema = Schema(title=TEXT(stored=True), body=TEXT(analyzer=analyzer, stored=True)) # Empty index folder if needed if os.path.exists(INDEX_PATH): shutil.rmtree(INDEX_PATH, ignore_errors=True) os.mkdir(INDEX_PATH) # Remove possible Word2Vec remains for f in glob.glob(OUTPUT_W2VMODEL_BIN + '.*'): os.remove(f) # Get list of files to process
def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start',
                stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True,
                analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost,
                sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
import unicodecsv as csv

from whoosh import index, sorting
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, STORED, NGRAMWORDS, NUMERIC
from whoosh.qparser import MultifieldParser

_schema = Schema(
    ror=STORED(),
    grid=STORED(),
    name=NGRAMWORDS(stored=False),
    aliases=NGRAMWORDS(stored=False),
    num_students=NUMERIC(int, sortable=True, stored=False),
    citation_score=NUMERIC(int, sortable=True, stored=False),
)

_index_path = 'data/ror-whoosh-index'


def _read_ror_csv_rows():
    rows = []
    with open('data/ror-metrics.csv') as ror_csv:
        reader = csv.DictReader(ror_csv)
        for row in reader:
            row['aliases'] = row['aliases'].split(u'###') if row['aliases'] else []
            row['num_students'] = int(row['num_students']) if row['num_students'] else None
            row['citation_score'] = float(row['citation_score']) if row['citation_score'] else None
            rows.append(row)
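# Hypothetical usage sketch (not part of the original module): search the
# n-grammed name/aliases fields and order hits by the sortable citation_score
# column instead of by relevance. The helper name and query text are assumptions.
def _search_ror(query_text, limit=10):
    ix = index.open_dir(_index_path)
    with ix.searcher() as searcher:
        parser = MultifieldParser(['name', 'aliases'], ix.schema)
        by_citations = sorting.FieldFacet('citation_score', reverse=True)
        results = searcher.search(parser.parse(query_text), limit=limit, sortedby=by_citations)
        return [(hit['ror'], hit['grid']) for hit in results]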
if not os.path.exists(SSAWG_index_dir):
    print("Creating index folder...")
    os.mkdir(SSAWG_index_dir)

## Add fields programmatically by parsing the first line of the file
Searchable = ('Text', )
ix = index.create_in(SSAWG_index_dir, schema)
writer = ix.writer()
for field in fieldnames:
    if field in Searchable:
        print(field)
        writer.add_field(field,
                         NGRAMWORDS(minsize=NgramMin, maxsize=NgramMax,
                                    stored=True))  # May need to adjust size to allow for description
    else:
        writer.add_field(field, TEXT(stored=True, chars=True))

mtgCnt = 0
for Meeting in Meetings:  # Text is NGRAMMED, link is stored
    #print('-----------------')
    #print(str(Meeting.text))
    StrippedText = ''
    for item in Meeting.find_all('li'):  # for each list item...
        if item.text:
            CurStrip = item.text
        else:
            CurStrip = ''
        StrippedText += CurStrip.strip() + '\n'
        key = f'{eid}:{locale}:tags'
        for tag in tags['values']:
            storage.lpush(key, tag)


if __name__ == '__main__':
    print('-' * 30)
    print('Muzeeglot data ingestion')
    print('-' * 30)
    if exists(configuration.INGESTION_LOCK):
        print('WARN: ingestion lock detected, pass')
    else:
        print('INFO: evaluate tags corpus')
        tags_corpus = get_tags_corpus()
        print('INFO: create search index')
        if not exists(configuration.INDEX):
            makedirs(configuration.INDEX)
        schema = Schema(ngram=NGRAMWORDS(), name=STORED(), eid=STORED())
        index = create_in(configuration.INDEX, schema)
        writer = BufferedWriter(index, period=60, limit=200)
        ingest_languages(writer)
        ingest_tags(tags_corpus)
        ingest_entities(tags_corpus, writer)
        print('INFO: optimize and close index')
        writer.close()
        index.optimize()
        index.close()
        print('INFO: write ingestion lock')
        with open(configuration.INGESTION_LOCK, 'w') as stream:
            stream.write('ingested')
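# Hypothetical lookup sketch (not part of the ingestion script): open the index
# written above and search the ngram field; name and eid are STORED, so they
# come back on each hit. The helper name and limit are assumptions.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def suggest(text, limit=10):
    ix = open_dir(configuration.INDEX)
    with ix.searcher() as searcher:
        query = QueryParser('ngram', ix.schema).parse(text)
        return [(hit['name'], hit['eid']) for hit in searcher.search(query, limit=limit)]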
import os, time, threading

from whoosh.fields import Schema, KEYWORD, NGRAMWORDS, NUMERIC, TEXT
from whoosh.index import create_in, open_dir
from whoosh.writing import AsyncWriter
from whoosh.qparser import QueryParser, MultifieldParser, FieldsPlugin, FieldAliasPlugin
#from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer, NgramFilter, NgramAnalyzer
#from whoosh.query import *

#https://whoosh.readthedocs.io/en/latest/quickstart.html
schema = Schema(id=NUMERIC(stored=True, unique=True, signed=False),
                category=TEXT,
                title=NGRAMWORDS(2, 20, True, 2.0),
                ingredients=KEYWORD,
                content=NGRAMWORDS(4, 20))
#TODO: synonyms https://whoosh.readthedocs.io/en/latest/api/lang/wordnet.html

search_path = "search"
ALWAYS_REBUILD = False
min_search_length = 2

if not os.path.exists(search_path):
    os.mkdir(search_path)


def rebuild_index():
    index = create_in(search_path, schema)
    writer = index.writer()
    writer.add_document(id=0, title="Test Words", content="super nice")
    writer.add_document(id=1, title="Apple Banana Cucumber")
    writer.add_document(id=2, title="Deck Elevator Floor", category="test")
    writer.add_document(id=3, title="Pen Pineapple Apple Pen")
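# Hypothetical search helper (the original rebuild_index is truncated above):
# query title/content with a MultifieldParser and honour min_search_length
# before parsing. The helper name and result handling are assumptions.
def search_index(text):
    if len(text) < min_search_length:
        return []
    ix = open_dir(search_path)
    with ix.searcher() as searcher:
        parser = MultifieldParser(["title", "content"], ix.schema)
        return [hit['id'] for hit in searcher.search(parser.parse(text), limit=20)]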
def build_schema(self, fields):
    # Copied from https://github.com/django-haystack/django-haystack/blob/v2.8.1/haystack/backends/whoosh_backend.py
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ""

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = WHOOSH_ID(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ["date", "datetime"]:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == "integer":
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == "float":
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == "boolean":
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == "ngram":
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == "edge_ngram":
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at="start",
                stored=field_class.stored,
                field_boost=field_class.boost,
            )
        else:
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True,
                analyzer=getattr(field_class, "analyzer", StemmingAnalyzer()),
                field_boost=field_class.boost,
                sortable=True,
            )

        schema_fields[field_class.index_fieldname].field_name = field_name

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
from whoosh.fields import Schema, TEXT, STORED, NGRAMWORDS
from whoosh.index import create_in, open_dir
from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer, NgramFilter, NgramAnalyzer, NgramWordAnalyzer
#from whoosh.query import *
from whoosh.qparser import QueryParser, MultifieldParser, FieldsPlugin

analyzer = NgramAnalyzer(3)
schema = Schema(
    id=STORED,
    category=TEXT(field_boost=3.0),
    #title = TEXT(analyzer, False)
    title=NGRAMWORDS(2, 20, False, 2.0))

index = create_in("search", schema)
#index = open_dir("search")
writer = index.writer()
writer.add_document(id=0, title="Test Words")
writer.add_document(id=1, title="Apple Banana Cucumber")
writer.add_document(id=2, title="Deck Elevator Floor", category="test")
writer.add_document(id=3, title="Pen Pineapple Apple Pen")
writer.commit()

#parser = QueryParser("title", schema)
parser = MultifieldParser(["category", "title"], schema, {
    "category": 3.0,
    "title": 2.0
})
parser.remove_plugin_class(FieldsPlugin)

with index.searcher() as searcher:
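    # Hypothetical completion (the original snippet is truncated here): run a
    # sample query across both boosted fields and print the stored ids.
    results = searcher.search(parser.parse(u"apple"), limit=10)
    for hit in results:
        print(hit['id'], hit.score)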
json_data_dir = '/srv/wallacewebapp/climasng/data'
species_json_file = 'species.json'
summaries_json_file = 'summaries.json'
search_index_dir = os.path.join(json_data_dir, 'searchindex')

if os.path.isdir('/Users/pvrdwb'):
    # ..overwrite with local dev paths
    json_data_dir = '/Users/pvrdwb/projects/climas-global/webapp/climasng/data'
    search_index_dir = os.path.join(json_data_dir, 'searchindex')

# define schema for indexed info
schema = Schema(nice_name=NGRAMWORDS(2, 8, at='start', sortable=True, stored=True),
                name_id=ID(stored=True, unique=True),
                item_id=ID(stored=True),
                item_path=STORED,
                item_type=KEYWORD(stored=True))

debug_output_level = 3  # max 5 (only errors show)

# -------------------------------------------------------------------

def msg(message, debug_level=3):
    ''' debug level goes from 1 (very minor) to 5 (massive problem) '''
    if debug_level >= debug_output_level:
        prefix = ['', 'dbug', 'info', 'mesg', 'warn', 'BOOM'][debug_level]