def __init__(self):
    path_dir = Path.IndexXmlDir
    schema = Schema(doc_no=ID(stored=True),
                    doc_subject=TEXT(analyzer=RegexTokenizer(), stored=True),
                    doc_content=TEXT(analyzer=RegexTokenizer(), stored=True))
    indexing = index.create_in(path_dir, schema)
    self.writer = indexing.writer()
    return
def text_treat(path):
    conto = {
        "titulo": "",
        "categoria": "",
        "texto": [],
        "ano": "",
        "full": [],
        "tokens": []
    }
    with open(path, 'r', encoding="ISO-8859-1") as arquivo:
        conto["full"] = arquivo.readlines()
    analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(lang="portuguese")

    # example: Poesia, Americanas, 1875
    p = re.compile(r'[/.,]')
    inf = p.split(conto["full"][0])
    # inf = conto["full"][0].split(r"[/.,]")
    conto["categoria"] = inf[0]
    conto["titulo"] = inf[1]
    conto["ano"] = inf[2].replace("\n", "")

    for i in range(len(conto["full"])):
        conto["texto"].append(conto["full"][i].replace('\n', ''))
        # remove stop words
        for token in analyzer(conto["texto"][i]):
            conto["tokens"].append(token.text)
        # conto["tokens"] = remove_stop_words(conto["texto"][i])
    return conto
def remove_stop_words(str):
    analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(lang="portuguese")
    r = []
    for token in analyzer(str):
        r.append(token.text)
    return r
def test_token_boost():
    from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter
    ana = RegexTokenizer() | DoubleMetaphoneFilter()
    field = fields.TEXT(analyzer=ana, phrase=False)
    results = list(field.index(u("spruce view")))
    assert_equal(results, [('SPRS', 1, 1.0, b('\x00\x00\x00\x01')),
                           ('FF', 1, 0.5, b('\x00\x00\x00\x01')),
                           ('F', 1, 1.0, b('\x00\x00\x00\x01'))])
def create_index():
    # Raw string avoids invalid-escape warnings for \w in the pattern.
    regex_tokenize = re.compile(r'\w+(?:-\w+)+|<[A-Z]+>[^<]+</[A-Z]+>|\w+', re.U)
    tokenizer = RegexTokenizer(regex_tokenize)
    schema = Schema(sentence=TEXT(stored=True, analyzer=tokenizer))
    if not os.path.exists("index_full"):
        os.mkdir("index_full")
    idx = create_in("index_full", schema)
    return idx
def main():
    parser = get_parser()
    args = parser.parse_args()
    NgramMin = args.nMin
    NgramMax = args.nMax
    if NgramMax < NgramMin:
        NgramMax = NgramMin
        print('nMax cannot be less than nMin. Setting nMax to nMin')
    MSID_index_dir = args.outfolder
    Searchable = args.s
    MSID_CSV_fnames = args.infiles
    # TBD: add cmdline flag to set/use a particular index
    if not os.path.exists(MSID_index_dir):
        print("Doesn't exist, creating directory %s" % MSID_index_dir)
        os.mkdir(MSID_index_dir)

    ## Indicate which fields to NGRAM-ize
    ## Create Empty Schema
    schema = Schema()
    ix = index.create_in(MSID_index_dir, schema)
    writer = ix.writer()
    writer.commit(mergetype=writing.CLEAR, optimize=True)  # Erase the index to start from scratch

    writer = ix.writer()
    for cur_file in MSID_CSV_fnames:
        with open(cur_file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            fieldnames = reader.fieldnames
            for field in fieldnames:
                if field not in ix.schema.names():
                    if field in Searchable:
                        # NGRAM-ize the field
                        writer.add_field(
                            field,
                            TEXT(stored=True,
                                 analyzer=NgramWordAnalyzer(NgramMin,
                                                            maxsize=NgramMax,
                                                            tokenizer=RegexTokenizer()),
                                 phrase=False))  # May need to adjust size to allow for description
                    else:
                        # Just store raw text
                        writer.add_field(field, TEXT(stored=True))
    writer.commit(optimize=True)

    writer = ix.writer()
    idx_cnt = 0
    for cur_file in MSID_CSV_fnames:
        print('Indexing %s' % cur_file)
        with open(cur_file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                idx_cnt += 1
                writer.add_document(**row)
                last_row = row
                if idx_cnt % 1000 == 0:
                    print(idx_cnt)
    print('Indexing Done, committing changes to disk')
    writer.commit()
def schema(self):
    my_analyzer = RegexTokenizer("[a-zA-Z_]+") | LowercaseFilter() | StopFilter()
    schema = Schema(
        h=TEXT(stored=True, analyzer=my_analyzer),
        gnx=ID(stored=True),
        b=TEXT(analyzer=my_analyzer),
        parent=ID(stored=True),
        doc=ID(stored=True),
    )
    return schema
class Post(db.Model, ModelSaveMixin):
    __tablename__ = 'posts'
    __searchable__ = ['title', 'content']
    __msearch_schema__ = {
        "title": TEXT(
            stored=True,
            analyzer=RegexTokenizer() | CaseSensitivizer(),
            sortable=False),
        "content": TEXT(
            stored=True,
            analyzer=RegexTokenizer(),
            sortable=False,
        )
    }

    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(49))
    content = db.Column(db.Text)
def __init__(self, path, schema):
    self.analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()
    self.schema = schema
    if not os.path.exists("index"):  # make an index folder if one does not exist
        os.mkdir("index")
    index.create_in("index", self.schema)
    self.ix = index.open_dir("index")
def __init__(self):
    self.storage = RamStorage()
    schema = Schema(key=ID(stored=True),
                    ask=BOOLEAN(stored=True),
                    content=TEXT(stored=True, analyzer=RegexTokenizer()))
    self.ix = self.storage.create_index(schema)
    self.writer = self.ix.writer()
    self.is_train = False
    for s in greeting.split('\n'):
        self.train(u'matchinggreeting', s)
def __init__(self):
    rootPath = pathlib.Path(__file__).parent.parent.__str__()
    self.inputFilePath = rootPath + Path.InputPickleFile
    schema = Schema(doc_no=ID(stored=True),
                    doc_content=TEXT(analyzer=RegexTokenizer(), stored=True),
                    doc_title=TEXT(stored=True))
    indexing = index.create_in(rootPath + Path.IndexPath, schema)
    self.writer = indexing.writer()
    return
def item_name_analyzer():
    """
    Analyzer behaviour:

    Input: u"some item name", u"SomeItem/SubItem", u"GSOC2011"

    Output: u"some", u"item", u"name"; u"Some", u"Item", u"Sub", u"Item";
    u"GSOC", u"2011"
    """
    iwf = MultiFilter(index=IntraWordFilter(mergewords=True, mergenums=True),
                      query=IntraWordFilter(mergewords=False, mergenums=False))
    analyzer = RegexTokenizer(r"\S+") | iwf | LowercaseFilter()
    return analyzer
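# A minimal usage sketch, not part of the original snippet: run the analyzer
# above in index mode and print the token texts. The sample string is assumed;
# the MultiFilter picks the index-time IntraWordFilter based on mode="index".
if __name__ == "__main__":
    ana = item_name_analyzer()
    print([t.text for t in ana(u"SomeItem/SubItem", mode="index")])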
def CleanupStandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                            minsize=2, maxsize=None, gaps=False):
    ret = RegexTokenizer(expression=expression, gaps=gaps)
    # added CleanupFilter here
    chain = ret | CleanupFilter() | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain
def set_schema(self, df_schema):
    """Whoosh schema: every df_schema field is stored but not indexed,
    plus an extra 'body' field that is analyzed, indexed and stored."""
    customWordFilter = RegexTokenizer() | \
                       LowercaseFilter() | \
                       CustomFilter(nltk.stem.porter.PorterStemmer().stem) | \
                       CustomFilter(nltk.WordNetLemmatizer().lemmatize)
    whoosh_schema = {item: STORED for item in df_schema}
    whoosh_schema.update(
        {'body': TEXT(stored=True, analyzer=customWordFilter)})
    print('Whoosh_schema', whoosh_schema)
    return Schema(**whoosh_schema)
def __init__(self):
    chfilter = CharsetFilter(accent_map)
    stoplist = stoplists["en"].union(stoplists["fr"])
    analyzer = RegexTokenizer() | LowercaseFilter() | \
               StopFilter(stoplist=stoplist) | chfilter

    # Defines the schema
    # see http://pythonhosted.org/Whoosh/schema.html for reference
    keywordType = KEYWORD(lowercase=True, scorable=True)
    self.schema = Schema(content=TEXT(analyzer=analyzer),
                         docType=TEXT,
                         docId=ID(stored=True, unique=True),
                         tags=keywordType)

    # Adds dynamic fields so each document can index its fields in the
    # same Whoosh index
    self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
    self.schema.add('*_date', DATETIME, glob=True)
    self.schema.add('*_number', NUMERIC, glob=True)
    self.schema.add('*_boolean', BOOLEAN, glob=True)

    # Creates the index folder and Whoosh index files if they don't exist,
    # and loads the index in either case
    if not os.path.exists("indexes"):
        os.mkdir("indexes")
        self.index = index.create_in("indexes", self.schema)
    else:
        self.index = index.open_dir("indexes")

    # Creates the doctypes folder if it doesn't exist
    if not os.path.exists("doctypes"):
        os.mkdir("doctypes")

    # Creates the default doctypes schema file if it doesn't exist
    if not os.path.exists('doctypes/doctypes_schema.json'):
        with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
            defaultFile.write("{}")

    '''
    Loads the doctypes schema if it's valid, otherwise recreates it.
    The doctypes schema is a dictionary of doctypes with their fields,
    created and updated when a document is indexed. That way, we can tell
    Whoosh which fields to search by default, because there is apparently
    no way to say "search in all fields".
    '''
    with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
        try:
            self.doctypesSchema = json.load(rawJSON)
        except ValueError:
            rawJSON.write("{}")
            self.doctypesSchema = {}
def init_env(self):
    from whoosh import qparser, query, scoring
    from whoosh.analysis import RegexTokenizer
    from whoosh.lang.morph_en import variations

    self.freq_searcher = self.idx.searcher(weighting=scoring.Frequency())
    self.tfidf_searcher = self.idx.searcher(weighting=scoring.TF_IDF())
    self.bm25_searcher = self.idx.searcher(
        weighting=scoring.BM25F(B=0.74, K1=1.52))
    self.query_parser = QueryParser('abstract', self.idx.schema)
    self.query_parser.add_plugin(FuzzyTermPlugin())
    self.title_parser = QueryParser('title', self.idx.schema)
    self.title_parser.add_plugin(FuzzyTermPlugin())
    self.tokenizer = RegexTokenizer()
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    # Pre-segment the text with jieba, then tokenize the space-joined result
    # with a whitespace-based RegexTokenizer.
    value = " ".join(jieba.cut_for_search(value))
    reg = RegexTokenizer(r"[^ \t\r\n]+")
    # Forward the arguments by keyword; passing them positionally would shift
    # `mode` into the `tokenize` slot of RegexTokenizer.__call__.
    return reg(value, positions=positions, chars=chars,
               keeporiginal=keeporiginal, removestops=removestops,
               start_pos=start_pos, start_char=start_char,
               tokenize=tokenize, mode=mode, **kwargs)
def LemmatizingAnalyzer(stoplist=STOP_WORDS, minsize=2, maxsize=None):
    """
    Analyzer that performs tokenization, lowercasing, stopword removal and
    lemmatization.

    :param stoplist: stopword list; it can be merged with another list
    :param minsize: words shorter than this value are discarded
    :param maxsize: words longer than this value are discarded
    """
    ret = RegexTokenizer(expression=default_pattern, gaps=False)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | LemmatizerFilter()
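# A minimal usage sketch (assumed, not from the original module): the stoplist
# can be extended with extra words ("foo", "bar" are placeholders) and short
# tokens dropped via minsize.
extended_analyzer = LemmatizingAnalyzer(stoplist=STOP_WORDS.union(["foo", "bar"]),
                                        minsize=3)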
def CleanupStemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                            minsize=2, maxsize=None, gaps=False, stemfn=stem,
                            ignore=None, cachesize=50000):
    ret = RegexTokenizer(expression=expression, gaps=gaps)
    # added CleanupFilter here
    chain = ret | CleanupFilter() | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize)
def search(query):
    """Takes a query string as input, creates a parser object from it, and
    matches entries in the constructed index against that query.
    """
    matched_attack_vectors = []

    # Stemming the query does not seem to produce better results
    # (rather the contrary):
    # stemmed_query = stem(query)

    # N-gram filtering, if needed:
    # my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4)
    # n_gram_query = [token.text for token in my_analyzer(query)]

    # The compound word filter takes a query string and produces a list of
    # individual words to check.
    cwf = CompoundWordFilter(query, keep_compound=True)
    analyzer = RegexTokenizer(r"\S+") | cwf
    cwf_query = [t.text for t in analyzer(query)]

    ix = index.open_dir("indexdir")
    schema = ix.schema
    query_parser = MultifieldParser(schema.names(), schema)
    with ix.searcher() as searcher:
        for instance in cwf_query:
            parse_query = query_parser.parse(instance)
            results = searcher.search(parse_query, limit=100000)
            for result in results:
                # Transform results back to attack vector definitions
                matched_attack_vectors.append(
                    AttackVector(
                        db_id=result['db_id'],
                        db_name=result['db_name'],
                        name=result['name'],
                        related_weakness=result['related_weakness'],
                        related_vulnerability=result['related_vulnerability'],
                        related_attack_pattern=result['related_attack_pattern'],
                        contents=result['contents']))
    return matched_attack_vectors
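# Illustrative sketch, not from the original file: what the CompoundWordFilter
# step does in general. Tokens that are concatenations of words in the supplied
# word set are split, and the compound itself is kept because keep_compound=True.
from whoosh.analysis import RegexTokenizer, CompoundWordFilter

demo = RegexTokenizer(r"\S+") | CompoundWordFilter(["green", "eggs"], keep_compound=True)
print([t.text for t in demo(u"greeneggs and ham")])
# expected: ['greeneggs', 'green', 'eggs', 'and', 'ham']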
def create_indexer(doc_directory, index_directory):
    my_analyzer = RegexTokenizer() | LowercaseFilter()
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True, analyzer=my_analyzer),
                    summary=TEXT,
                    article=TEXT(analyzer=my_analyzer),
                    keywords=KEYWORD(stored=True, analyzer=my_analyzer),
                    date=DATETIME(stored=True),
                    path=TEXT(stored=True))
    if not os.path.exists(index_directory):
        os.mkdir(index_directory)
    ix = create_in(index_directory, schema)
    writer = ix.writer()

    nt = 0
    print("==============================")
    t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
    for dirname, subdirs, files in os.walk(doc_directory):
        if files != []:
            n = 0
            for filename in files:
                filename = os.path.join(dirname, filename)
                obj = load_json(filename)
                writer.add_document(id=obj['id'],
                                    title=obj['title'],
                                    summary=obj['summary'],
                                    article=obj['article'],
                                    keywords=obj['keywords'],
                                    date=obj['date'],
                                    path=filename)
                n += 1
            print("{}: {}".format(dirname, n))
            nt += n
    t2 = time.perf_counter()
    print("==============================")
    print("Docs: {}, Time: {:.2f}s".format(nt, (t2 - t1)))
    print("Writing index...")
    writer.commit()
    t3 = time.perf_counter()
    print("Total time: {:.2f}s".format(t3 - t1))
    print("==============================")
def queryIndex(query):
    tokenizer = RegexTokenizer()
    return_list = []

    # Removing stop words
    with open("../smartStopList.txt", "r") as fp:
        line = fp.readline()
        words = []
        while line:
            words.append(line.replace('\n', ''))
            line = fp.readline()
    stopper = StopFilter(stoplist=frozenset(words))
    tokens = stopper(tokenizer(query))

    for t in tokens:
        t.text = t.text.lower()  # converting to lower case
        s = stem(t.text)  # stemming
        if len(s) > 2:
            return_list.append(s)
    return return_list
from whoosh.analysis import (CharsetFilter, RegexTokenizer, LowercaseFilter,
                             PathTokenizer, NgramFilter)
from whoosh.support.charset import accent_map
from whoosh.formats import Existence
from whoosh.fields import (SchemaClass, FieldType, ID, KEYWORD, DATETIME,
                           TEXT, NUMERIC)

from abilian.services.security.models import Role, Anonymous
from abilian.core.util import noproxy
from abilian.core.models.subjects import User, Group

#: A Whoosh analyzer that splits on word boundaries and folds accents and case.
accent_folder = (RegexTokenizer(r'\w+')  # the default pattern doesn't split on '.'
                 | LowercaseFilter()
                 | CharsetFilter(accent_map))

#: Analyzer for edge-ngrams, from 2 to 6 characters long
edge_ngram = accent_folder | NgramFilter(minsize=2, maxsize=6, at='start')


def EdgeNgramField():
    return TEXT(stored=False, analyzer=edge_ngram)


class _DefaultSearchSchema(SchemaClass):
    """General search schema."""
#!/usr/bin/env python
import os
from whoosh.index import create_in
from whoosh.fields import Schema, ID, TEXT
from whoosh.analysis import LowercaseFilter, RegexTokenizer, StopFilter

# Analyzers used by the schema
my_analizer = RegexTokenizer() | LowercaseFilter() | StopFilter(lang="es")

# Schema that stores the title and ID
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                num_noticia=TEXT(stored=True),
                doc=TEXT(stored=True),
                content=TEXT)  # (analyzer=my_analizer)

# Name of the directory where the index will be stored
idir = "index_dir"

# Create the index directory if it does not exist
if not os.path.exists(idir):
    os.mkdir(idir)
ix = create_in(idir, schema)

# The writer adds the entries to the index
writer = ix.writer()

# Files to add
nomF = os.listdir("./enero")
for filename in nomF:
    f = open("./enero/" + filename, mode='r')
    f = str(f.read()).split("<DOC>")
# RhodeCode Enterprise Edition, including its added features, Support services,
# and proprietary license terms, please see https://rhodecode.com/licenses/

"""
Fallback Whoosh schema for RhodeCode, used in case the one defined by
rhodecode_tools is not available.
"""

from __future__ import absolute_import

from whoosh.analysis import RegexTokenizer, LowercaseFilter
from whoosh.formats import Characters
from whoosh.fields import (TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema,
                           FieldType, DATETIME)

# CUSTOM ANALYZER: wordsplit + lowercase filter for case-insensitive search
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

# FILE INDEX SCHEMA DEFINITION
FILE_INDEX_NAME = 'FILE_INDEX'
FILE_SCHEMA = Schema(
    fileid=ID(unique=True),  # Path
    repository=ID(stored=True),
    repository_id=NUMERIC(unique=True, stored=True),  # Numeric id of repo
    repo_name=TEXT(stored=True),
    owner=TEXT(),
    path=TEXT(stored=True),
    content=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    modtime=STORED(),
def NgramWordAnalyzer(minsize=2, maxsize=None):
    # NgramFilter requires an explicit minimum n-gram size; a default of 2 is
    # assumed here so the original no-argument call keeps working.
    return RegexTokenizer() | LowercaseFilter() | NgramFilter(minsize, maxsize)
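# Minimal usage sketch (assumed, not from the original code): n-gram fields are
# typically declared with phrase=False, since positional phrase search adds
# little on top of n-gram matching.
from whoosh.fields import TEXT

partial_match_field = TEXT(stored=True, analyzer=NgramWordAnalyzer(), phrase=False)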
def get_schema():
    return Schema(filename=ID(unique=True, stored=True),
                  content=TEXT(phrase=True, analyzer=RegexTokenizer(r"[^ \n]+")))
def create_analyzer():
    conf = config.get_config()
    # Build the pipeline incrementally instead of enumerating every combination
    # of options; the resulting filter chain and its order are unchanged:
    # RegexTokenizer | [StopFilter] | [CharsetFilter] | [StemFilter] | [NgramFilter]
    analyzer = RegexTokenizer()
    if conf['STOPWORDS']:
        analyzer = analyzer | StopFilter()
    if conf['CHARACTERS_FOLDING']:
        analyzer = analyzer | CharsetFilter(accent_map)
    if conf['STEMMING']:
        analyzer = analyzer | StemFilter()
    if conf['QGRAMS']:
        analyzer = analyzer | NgramFilter(minsize=conf['QNUM_MIN'],
                                          maxsize=conf['QNUM_MAX'])
    log.print_debug(TAG, "Analyzer created")
    return analyzer
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")
        if option == "1":
            sent_tokenize_list1 = sent_tokenize(file_content_doc1, language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2, language='english')
            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")

            my_analyzer = RegexTokenizer() | StopFilter() | LowercaseFilter() | Lemmatizer()
            pos_tagger = RegexTokenizer() | StopFilter() | LowercaseFilter() | PosTagger()
            wordnetsyn1 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets()
            wordnetsyn2 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets1()
            wordnetsyn3 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets2()
            wordnetsyn4 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets3()

            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True, analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))

            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()

            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            writer.commit()
            print_index_details(ix)
            print("\n\n Index created with various features as its fields")

        elif option == "2":
            ix = index.open_dir("index_task3")
            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser(["standard", "stem_text", "lemma",
                                               "pos_text", "hyponym", "meronyms",
                                               "hypernym", "holonym"],
                                              schema=ix.schema,
                                              group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                    print("\n")

        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
            option = None
        else:
            print("\n Not valid choice try again...!")
class CustomFuzzyTerm(FuzzyTerm):
    """
    Custom FuzzyTerm query parser to set a custom maxdist
    """
    def __init__(self, fieldname, text, boost=1.0, maxdist=1):
        FuzzyTerm.__init__(self, fieldname, text, 1.0, 2)


logger = logging.getLogger("indexer" + __name__)

##==========================={Index-Schema}=====================================
chfilter = CharsetFilter(accent_map)
stoplist = stoplists["en"].union(stoplists["ru"])
analyzer = (RegexTokenizer() | LowercaseFilter() |
            StopFilter(stoplist=stoplist) | chfilter)

# Define the schema
keywordType = KEYWORD(lowercase=True, scorable=True)


def add_fields(schema):
    """
    * -------------{Function}---------------
    * Add dynamic fields so each document can index its fields in
    * the same Whoosh index
    * -------------{returns}----------------
    * Whoosh Schema . . .
    * -------------{params}-----------------
    * : whoosh.fields.Schema