                       action='store_true',
                       help='lookup ranking of vulnerable configuration')
args = argParser.parse_args()

if not args.q and not args.l and not args.g and not args.m:
    argParser.print_help()
    exit(1)

if args.f or args.t:
    from lib import CVEs
    cves = CVEs.last(rankinglookup=args.r, namelookup=args.n)

if args.q:
    with ix.searcher() as searcher:
        if not args.o:
            query = QueryParser("content", ix.schema).parse(" ".join(args.q))
        else:
            query = QueryParser("content", schema=ix.schema,
                                group=qparser.OrGroup).parse(" ".join(args.q))
        results = searcher.search(query, limit=None)
        for x in results:
            if not args.f:
                print(x['path'])
            else:
                print(json.dumps(cves.getcve(x['path']), sort_keys=True,
                                 default=json_util.default))

if args.t and not args.f:
def get_more_search_result():
    query = request.form['query']
    q = []
    q.append(query)
    page_offset = int(request.form['page_offset'])
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    path_array = []
    preview_array = []
    date_array = []
    size_array = []
    list_tags = []

    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(" ".join(q))
        results = searcher.search_page(query, page_offset, num_elem_to_get)
        for x in results:
            path = x.items()[0][1]
            path = path.replace(PASTES_FOLDER, '', 1)
            path_array.append(path)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(content) > max_preview_char else len(content) - 1
            preview_array.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
            date_array.append(curr_date)
            size_array.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:' + path)
            l_tags = []
            for tag in p_tags:
                complete_tag = tag
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]
                l_tags.append((tag, complete_tag))
            list_tags.append(l_tags)

    to_return = {}
    to_return["path_array"] = path_array
    to_return["preview_array"] = preview_array
    to_return["date_array"] = date_array
    to_return["size_array"] = size_array
    to_return["list_tags"] = list_tags
    to_return["bootstrap_label"] = bootstrap_label
    if len(path_array) < num_elem_to_get:  # pagelength
        to_return["moreData"] = False
    else:
        to_return["moreData"] = True

    return jsonify(to_return)
    def search(self,
               given_query='',  # search function
               in_query=[''],
               ex_query=[''],
               diets=[],
               allergies=[],
               page=1,
               ranking="BM25"):
        # These are only for parsing not for filling the results
        keys = ['name', 'ingredients', 'cautions', 'dietLabels', 'healthLabels']
        try:  # open the index
            index = open_dir('WhooshIndex')
        except Exception:
            self.index()  # make the index if it doesn't exist
            index = open_dir('WhooshIndex')

        if ranking == "TF-IDF":  # set the ranking algorithm
            ranking = scoring.TF_IDF()
        else:
            ranking = scoring.BM25F()

        with index.searcher(weighting=ranking) as searcher:
            # Universal all docs in case of None
            # because in the intersection the smaller
            # result will be returned
            parser = QueryParser('url', schema=index.schema)
            q = parser.parse('http OR https')
            all_docs = searcher.search(q, limit=None)

            # Creates an empty result for a filter and mask
            p = QueryParser('id', schema=index.schema)
            q = p.parse('')
            myMask = searcher.search(q, limit=None)
            myFilter = searcher.search(q, limit=None)

            # include query parsing
            if in_query != ['']:
                in_parser = QueryParser('ingredients', schema=index.schema)
                inFilter = searcher.search(q, limit=None)
                in_q = in_parser.parse(in_query[0])  # get the first ingredient...
                in_r = searcher.search(in_q, limit=None)
                inFilter.extend(in_r)
                for q in in_query:
                    # take the intersection of remaining docs with docs containing the next ingredient
                    in_q = in_parser.parse(q)
                    in_r = searcher.search(in_q, limit=None)
                    inFilter.filter(in_r)
                myFilter.extend(inFilter)

            # exclude query parsing
            if ex_query != ['']:
                ex_parser = QueryParser('ingredients', schema=index.schema)
                for q in ex_query:
                    ex_q = ex_parser.parse(q)
                    ex_r = searcher.search(ex_q, limit=None)
                    myMask.extend(ex_r)  # list of docs to mask

            # allergies query parsing
            if allergies != []:
                allergy_parser = QueryParser('cautions', schema=index.schema)
                for q in allergies:
                    allergy_q = allergy_parser.parse(q)
                    allergy_r = searcher.search(allergy_q, limit=None)
                    myMask.extend(allergy_r)  # list of docs to mask

            # diets query parsing
            if diets != []:
                p = QueryParser('id', schema=index.schema)
                q = p.parse('')
                dietFilter = searcher.search(q, limit=None)
                diet_parser = QueryParser('dietInfo', schema=index.schema)
                diet_q = diet_parser.parse(diets[0])
                diet_r = searcher.search(diet_q, limit=None)  # get the first diet
                dietFilter.extend(diet_r)
                for d in diets:
                    # take the intersection of what's already in the filter and the new docs to filter by
                    diet_q = diet_parser.parse(d)
                    diet_r = searcher.search(diet_q, limit=None)
                    dietFilter.filter(diet_r)
                if in_query == ['']:
                    # if we had no ingredients filter, let the filter be the diet filter
                    myFilter.extend(dietFilter)
                else:
                    # otherwise the filter is the intersection of our two filters
                    myFilter.filter(dietFilter)

            # filtering results to get intersection
            # print(type(results))
            # Check if the filter is empty so we don't intersect nothing
            if diets == [] and in_query == ['']:
                myFilter = all_docs
            elif myFilter.scored_length() == 0:
                # if we filtered and got nothing, we should return nothing
                payload = {}
                payload_entries = list()
                payload['entries'] = payload_entries
                payload['total'] = 0
                return payload

            if given_query != '' and given_query != None:  # the actual search
                if given_query[0] == '"' and given_query[-1] == '"':
                    given_query = given_query[1:-1]
                    parser = MultifieldParser(keys, schema=index.schema)
                else:
                    parser = MultifieldParser(keys, schema=index.schema, group=OrGroup)
                query = parser.parse(given_query)
                results = searcher.search_page(query, page, filter=myFilter, mask=myMask)
            else:
                # if we aren't given a query for the search, filter and mask all docs
                parser = QueryParser('url', schema=index.schema)
                q = parser.parse('http OR https')
                results = searcher.search_page(q, page, filter=myFilter, mask=myMask)

            # Format results for returning
            payload = {}
            payload_entries = list()
            for x in results:
                payload_entries.append({
                    'name': x['name'],
                    'image': x['image'],
                    'id': x['id']
                })
            payload['entries'] = payload_entries
            payload['total'] = len(results)
            return payload
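# The include/exclude logic above leans on Whoosh's Results.extend and
# Results.filter, which behave roughly like set union and intersection over
# hit lists. A minimal, self-contained sketch of that behaviour, assuming a
# throwaway in-memory index with made-up data (the field names below are
# illustrative, not this project's real schema):
from whoosh.fields import Schema, TEXT, ID
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

toy_schema = Schema(id=ID(stored=True), ingredients=TEXT(stored=True))
toy_ix = RamStorage().create_index(toy_schema)
toy_writer = toy_ix.writer()
toy_writer.add_document(id=u"1", ingredients=u"egg flour sugar")
toy_writer.add_document(id=u"2", ingredients=u"egg milk")
toy_writer.add_document(id=u"3", ingredients=u"flour water")
toy_writer.commit()

with toy_ix.searcher() as toy_searcher:
    toy_parser = QueryParser("ingredients", schema=toy_ix.schema)
    egg_hits = toy_searcher.search(toy_parser.parse(u"egg"), limit=None)      # docs 1, 2
    flour_hits = toy_searcher.search(toy_parser.parse(u"flour"), limit=None)  # docs 1, 3

    # extend() acts like a union: egg_hits now also contains doc 3
    egg_hits.extend(flour_hits)

    # filter() acts like an intersection: only hits also present in flour_hits survive
    both = toy_searcher.search(toy_parser.parse(u"egg"), limit=None)
    both.filter(flour_hits)
    print([hit["id"] for hit in both])  # expected: ['1'] (egg AND flour)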
def filter_queryset(self, request, queryset, view):
    if ('parent' in request.query_params and
            request.query_params['parent'] == ''):
        # Empty string means query for null parent
        queryset = queryset.filter(parent=None)

    try:
        q = request.query_params['q']
    except KeyError:
        return queryset

    # Short-circuit some commonly used queries
    COMMON_QUERY_TO_ORM_FILTER = {
        'asset_type:block': {'asset_type': 'block'},
        'asset_type:question': {'asset_type': 'question'},
        'asset_type:survey': {'asset_type': 'survey'},
        'asset_type:question OR asset_type:block': {
            'asset_type__in': ('question', 'block')
        }
    }
    try:
        return queryset.filter(**COMMON_QUERY_TO_ORM_FILTER[q])
    except KeyError:
        # We don't know how to short-circuit this query; pass it along to
        # the search engine
        pass
    except FieldError:
        # The user passed a query we recognized as commonly-used, but the
        # field was invalid for the requested model
        return queryset.none()

    queryset_pks = list(queryset.values_list('pk', flat=True))
    if not len(queryset_pks):
        return queryset

    # 'q' means do a full-text search of the document fields, where the
    # criteria are given in the Whoosh query language:
    # https://pythonhosted.org/Whoosh/querylang.html
    search_queryset = SearchQuerySet().models(queryset.model)
    search_backend = search_queryset.query.backend
    if not isinstance(search_backend, WhooshSearchBackend):
        raise NotImplementedError(
            'Only the Whoosh search engine is supported at this time')
    if not search_backend.setup_complete:
        search_backend.setup()

    # Parse the user's query
    user_query = QueryParser('text', search_backend.index.schema).parse(q)

    # Construct a query to restrict the search to the appropriate model
    filter_query = Term(DJANGO_CT, get_model_ct(queryset.model))

    # Does the search index for this model have a field that allows
    # filtering by permissions?
    haystack_index = haystack.connections[
        'default'].get_unified_index().get_index(queryset.model)
    if hasattr(haystack_index, 'users_granted_permission'):
        # Also restrict the search to records that the user can access
        filter_query &= Term(
            'users_granted_permission', request.user.username)

    with search_backend.index.searcher() as searcher:
        results = searcher.search(
            user_query,
            filter=filter_query,
            scored=False,
            sortedby=None,
            limit=None
        )
        if not results:
            # We got nothing; is the search index even valid?
            if not searcher.search(filter_query, limit=1):
                # There's not a single entry in the search index for this
                # model; assume the index is invalid and return the
                # queryset untouched
                return queryset
        pk_type = type(queryset_pks[0])
        results_pks = {
            # Coerce each `django_id` from unicode to the appropriate type,
            # usually `int`
            pk_type((x['django_id'])) for x in results
        }
    filter_pks = results_pks.intersection(queryset_pks)
    return queryset.filter(pk__in=filter_pks)
results = []
resSetLocal = set()
resSetTotal = set()

if len(fields) > 0 and len(fields) == len(myquery):
    i = 0
    while i < len(fields):
        # e.g. a field of the form: publication.title
        if "." in fields[i]:
            if fields[i].split(".")[0] not in type and not fields[i].split(".")[0] in schemaFields:
                type.append(fields[i].split(".")[0])
            # if myquery[i][0] == "\"":
            #     qparser = QueryParser(fields[i].split(".")[1], schema=ix.schema)
            # else:
            #     qparser = QueryParser(fields[i].split(".")[1], schema=ix.schema, group=whoosh.qparser.OrGroup)
            qparser = QueryParser(fields[i].split(".")[1], schema=ix.schema)
            query = qparser.parse(myquery[i])
            resTmp = searcher.search(query, limit=resultLimiter)
            runtime = resTmp.runtime
            if len(resSetTotal) == 0:
                for res in resTmp:
                    el = Hit(res)
                    resSetTotal.add(el)
            else:
                resSetLocal = set()
                for res in resTmp:
                    el = Hit(res)
                    resSetLocal.add(el)
            if len(resSetTotal) > 0 and len(resSetLocal) > 0:
                set1 = set(x.dic["key"] for x in resSetTotal)
                set2 = set(x.dic["key"] for x in resSetLocal)
def resolveQuery(query):
    # Add definition here
    # text = query + "IT WORKS LOLOLOLOL!"
    # Replace with suitable query -> Value
    post = query
    similar = []
    gg = [i for i in Complete]
    # print(Text)
    for i in range(0, len(gg)):
        try:
            name = gg[i]
            X1 = Complete[gg[i]]
            # print(Text)
            vectorizer = Vectorizers[gg[i]]
            new_post_vec1 = vectorizer.transform([post])
            for j in range(0, 10):
                try:
                    dist = sp.linalg.norm((new_post_vec1 - X1[j]).toarray())
                    # print(dist)
                    # print(j)
                    Text[i][j].replace('\\n', '<br>')
                    Text[i][j].replace('\n', '<br>')
                    similar.append((dist, name, Text[i][j]))
                    print(dist)
                except Exception as e:
                    print(e)
                    break
        except Exception as e:
            print(e)

    similar = sorted(similar)
    gg = list()
    with index.searcher() as searcher:
        query = QueryParser("content", index.schema).parse(u'Holding Pattern')
        results = searcher.search(query)
        for result in results:
            print(result)
            f = dict(result)
            gg.append(
                dict({
                    'result-text':
                    ''.join(re.escape(f['content']).split('\\')).replace(
                        '\n', '<br>').replace('\t', ' '),
                    'result-image': "asd",
                    'result-doc-link': 'google.com',
                    'result-doc-name': f['filename'],
                    'result-modified-date': '01-2-2019',
                    'result-id': "11"
                }))
    pgk = gg
    # print(similar)
    gg = list()
    for i in range(0, len(similar)):
        gg.append(
            dict({
                'result-text':
                ''.join(re.escape(similar[i][-1]).split('\\')).replace(
                    '\n', '<br>').replace('\t', ' '),
                'result-image': "asd",
                'result-doc-link': 'google.com',
                'result-doc-name': similar[i][1],
                'result-modified-date': '01-2-2019',
                'result-id': "123"
            }))
    # print(gg)
    gg = pgk + gg
    return gg
                content=TEXT(stored=True, analyzer=ChineseAnalyzer()))

print("start loading content...")
if not os.path.exists("index2"):
    os.mkdir("index2")
    ix = create_in("index2", schema)
    read_corpus_and_process()
else:
    ix = open_dir("index2")

print('start loading query')
query_idx, query_list = read_query_and_process()

print("start 1 searching ...")
parser = QueryParser("content", ix.schema, group=qparser.OrGroup)
nparser = QueryParser("content", ix.schema, group=qparser.NotGroup)
rank_list = [[] for i in range(len(query_list))]
content_list = [[] for i in range(len(query_list))]
with ix.searcher(weighting=scoring.BM25F(B=0.9, K1=1.2)) as searcher:
    for i in range(len(query_list)):
        query = parser.parse(query_list[i])
        neg_query = parser.parse(' '.join(neg_words))
        if any(word in query_list[i] for word in neg_words):
            query = query | neg_query
        else:
            for word in neg_words:
                query = query | nparser.parse(word)
        print(query)
        results = searcher.search(query, limit=300)
        for hit in results:
def search(domain):
    """ Search your indexed website. """
    # Look for index for requested domain
    try:
        ix = index.open_dir(
            os.path.expanduser('~/.sitesearcher/index'), indexname=domain)
    except (EmptyIndexError, OSError):
        click.echo("""
            No index was found for domain {0}.
            Use "sitesearcher indexer {0}" to create one and try again.
            """.format(domain), err=True)
        return
    searchterm = click.prompt('Please enter your search')
    parser = QueryParser("content", schema=ix.schema)
    with ix.searcher() as searcher:
        pagenum = 1
        # Paging for search results
        while pagenum > 0:
            results = searcher.search_page(parser.parse(searchterm), pagenum)
            results.results.formatter = ConsoleFormatter()
            if results.results.is_empty():
                click.echo("No results found!")
            else:
                click.echo("Search results:")
                click.echo()
            # Output all results for current page in nice readable format
            for result in results:
                click.echo("Result #{}".format(result.rank + 1))
                click.echo("URL: {}".format(result['url']))
                # As the site content is not stored locally
                # send a request to get the content of the search result url
                request = Request(
                    result['url'],
                    headers={'User-Agent': get_user_agent()}
                )
                response = urlopen(request).read()
                click.echo("Extract:")
                content = clean_response_body(response)
                # Provide console color highlighting for result
                snippet = result.highlights("content", text=content)
                snippet_parts = re.split(
                    '(<searchphrase>.*?</searchphrase>)',
                    snippet,
                    re.DOTALL
                )
                for snippet_part in snippet_parts:
                    if snippet_part.startswith('<searchphrase>'):
                        searchphrase = re.search(
                            '<searchphrase>(.*?)</searchphrase>',
                            snippet_part,
                            re.DOTALL
                        ).group(1)
                        click.echo(
                            click.style(searchphrase, fg='blue', bold=True),
                            nl=False
                        )
                    else:
                        click.echo(snippet_part, nl=False)
                click.echo('\n')
            # Handle pagination
            if results.pagenum < results.pagecount:
                click.echo(
                    'Press any key to see next result page or <ESC> to abort',
                    nl=False
                )
                char = click.getchar()
                click.echo()
                if char != u'\x1b':
                    pagenum += 1
                    continue
            pagenum = -1
    def search(self, keyword, limit=50):
        with self.index.searcher(closereader=False) as searcher:
            query = QueryParser("content", self.index.schema).parse(keyword)
            results = searcher.search(query, limit=limit)
            return results
if args.t:
    xr = ix.searcher().reader()
    for x in xr.most_frequent_terms("content", number=500, prefix=''):
        print(x)
    exit(0)

if args.s:
    # By default, the index is not storing the vector of the document (Whoosh
    # document schema). It won't work if you don't change the schema of the
    # index for the content. It depends on your storage strategy.
    docnum = ix.searcher().document_number(path=args.s)
    r = ix.searcher().more_like(docnum, "content")
    for hit in r:
        print(hit["path"])
    exit(0)

if args.q is None:
    argParser.print_help()
    exit(1)

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(" ".join(args.q))
    results = searcher.search(query, limit=None)
    for x in results:
        if args.f:
            print(readdoc(path=x.items()[0][1]))
        else:
            print(x.items()[0][1])
    print
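# As the comment above notes, more_like() needs term vectors or the stored
# text of the "content" field. A sketch of a schema that would support it;
# this is an assumption for illustration, not the schema this tool actually
# uses:
from whoosh import formats
from whoosh.fields import Schema, ID, TEXT

mlt_schema = Schema(
    path=ID(stored=True, unique=True),
    # Term vectors (or stored=True text) are what let Searcher.more_like()
    # extract representative terms from the reference document.
    content=TEXT(stored=True, vector=formats.Positions()),
)

# If the text is neither stored nor vectored, the raw text can instead be
# passed explicitly at query time:
#   ix.searcher().more_like(docnum, "content", text=raw_document_text)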
def find_unused_templates():
    start = time.perf_counter()
    print('Finding all unused templates...')
    print(' Getting global templates...')
    global_templates_files, global_templates = find_global_templates()
    print(' Done.\n Getting app templates...')
    app_templates_files, app_templates = find_app_templates()
    print(' Done.')
    templates = global_templates + app_templates
    template_files = global_templates_files + app_templates_files
    # templates.sort()
    template_files.sort()
    print(' Getting python files...')
    py_files, pys = find_py_files()
    print(' Done.')
    all_files = py_files + template_files
    tl_count = [0 for t in templates]
    unused_templates = []

    print(' Creating Index', end='')
    tmp_dir = TemporaryDirectory()
    schema = Schema(
        title=TEXT(stored=True),
        path=ID(stored=True),
        content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
    ix = create_in(tmp_dir.name, schema)
    writer = ix.writer()
    for filename in all_files:
        print('.', end='', flush=True)
        with open(filename, 'r') as f:
            writer.add_document(title=filename,
                                path=filename,
                                content='\n'.join(f.readlines()))
    print('', flush=True)
    writer.commit()
    print(' Done.')

    print(' Searching through templates for references', end='', flush=True)
    with ix.searcher() as searcher:
        for count, template in enumerate(templates):
            print('.', end="", flush=True)
            query = QueryParser("content", ix.schema).parse(template)
            results = searcher.search(query)
            if len(results) < 1:
                unused_templates.append(template)
    print('', flush=True)
    print(' Done.')

    if not unused_templates:
        print('No unused templates found.')
    else:
        print('\nUnused templates:')
        for template in unused_templates:
            print(template)

    end = time.perf_counter()
    print('Finished in ' + str(end - start) + ' seconds.')
    return unused_templates
writer.commit()

#############################################################################

""" Perform queries on Whoosh indexed data """

# Write a python program that takes queries (you need to design the supported queries)
# and search through the indexed archive using whoosh. A sample query to the program
# can be: RT:yes, keywords returns all the retweets that are related to the keywords.
# Your program should handle at least 4 queries (of your choice) similar to the sample query.

from whoosh.query import Term, And, Or
from whoosh.qparser import QueryParser

searcher = index.searcher()
parser = QueryParser("strong_hashtags", index.schema)
parser.parse("FIFAWWC USA JPN")

# Query 1: Player search
query = And([Term("tweet_text", "tobin"), Term("tweet_text", "heath")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 2: Player search
query = And([Term("tweet_text", "alex"), Term("tweet_text", "morgan")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])
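# A possible sketch for the "RT:yes, keywords" style query described above,
# reusing the searcher and the Term/And/Or imports from this snippet. The
# "is_retweet" field name is hypothetical and would have to match whatever
# the indexing schema actually calls that flag:
keywords = ["fifawwc", "usa"]
rt_query = And([
    Term("is_retweet", "yes"),
    Or([Term("tweet_text", kw) for kw in keywords]),
])
rt_results = searcher.search(rt_query)
print('# of hits:', len(rt_results))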
    def __writeDoc2(self, ix, path, occurrences):
        """
        Writes single documents into the Index.

        Receives the path to a single question directory and grabs the
        necessary pdf, LaTeX and python files to add to the Index. Also checks
        to see if the files are already in the Index before adding them.

        Parameters
        ----------
        arg1 : Index
            Index to add the documents to.
        arg2 : string
            Path to the question directory.
        arg3 : int
            Initial number of occurrences for the file.

        Returns
        -------
        bool
            Value of the operation.
        """
        writer = ix.writer()
        dateNow = dt.now()

        pathPdf = path + "true_or_false_question.pdf"
        textPdf = self.__decodePdf(pathPdf)

        pathPy = path + "program.py"
        textPy = self.__decodePy(pathPy)

        pathTex = path + "true_or_false_question.tex"
        textTex = self.__decodeTex(pathTex)

        finalPdfText = analyzeText(textPdf)
        # print(finalPdfText)

        flag = False
        with ix.searcher() as searcher:
            query = QueryParser("content", schema=ix.schema)
            parse = query.parse(finalPdfText)
            result = searcher.search(parse)
            flag = True
            if not result.is_empty():
                prevOc = result[0]["nOccurrences"]
                prevPath = result[0]["path"]
                prevDate = result[0]["date"]
                prevTags = result[0]["tags"]
                writer.update_document(
                    path=prevPath,
                    content=finalPdfText,
                    date=prevDate,
                    tags=prevTags,
                    nOccurrences=prevOc + 1,
                )
            else:
                writer.add_document(
                    path=pathPdf,
                    content=finalPdfText,
                    date=dateNow,
                    tags="pdf",
                    nOccurrences=1,
                )
                if textPy is not None:
                    writer.add_document(path=pathPy,
                                        content=analyzeText(textPy),
                                        date=dateNow,
                                        tags="py")
                if textPy is not None:
                    textTex += textPy
                writer.add_document(path=pathTex,
                                    content=analyzeText(textTex),
                                    date=dateNow,
                                    tags="tex")
        writer.commit()
        return flag
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 30 18:04:51 2014

@author: dlmu__000
"""
from whoosh.index import create_in
from whoosh.fields import *

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
indexdir = "."
ix = create_in(indexdir, schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"The second one is even more interesting!")
writer.commit()

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    print results[0]
from whoosh.qparser import QueryParser
from whoosh import index, sorting, scoring
from whoosh import qparser, query
from config import SEARCH_INDEX_DIR
import math
import unittest
from datausa.attrs.search import do_search

ix = index.open_dir(SEARCH_INDEX_DIR)
qp = QueryParser("name", schema=ix.schema, group=qparser.OrGroup)
facet = sorting.FieldFacet("zvalue", reverse=True)
scores = sorting.ScoreFacet()


class TestStringMethods(unittest.TestCase):
    NY_IDS = ['31000US35620', '05000US36061', '04000US36', '16000US3651000']

    def test_extra_word(self):
        data, suggs, tries, my_vars = do_search("new york economy")
        self.assertTrue(data[0][0] in self.NY_IDS)

    def test_manhattan(self):
        data, suggs, tries, my_vars = do_search("manhattan")
        self.assertEqual(data[0][0], "05000US36061")

    def test_exact_match_begin(self):
        data, suggs, tries, my_vars = do_search("nome")
        self.assertEqual(data[0][0], '16000US0254920')

    def test_ny(self):
        data, suggs, tries, my_vars = do_search("new york")
def get_des_vector():
    # nltk.download('stopwords')
    # nltk.download('punkt')
    # nltk.download('averaged_perceptron_tagger')
    db = pymysql.connect(host="localhost", user="******", passwd="123456", db="project")
    cursor = db.cursor()
    cursor.execute("SELECT name,description,catalog from detail d")
    data = cursor.fetchall()
    # The fetched data must be converted to a list here, otherwise DataFrame
    # raises an error during initialization.
    data = list(data)
    data = [list(i) for i in data]
    df = DataFrame(data, columns=["A", "B", "C"])
    # print(df)
    df1 = df[['B']]
    # print(str(df))
    # print(df)
    doclist = df1.values
    # print(doclist)
    tempdoclist = []
    mydoclist = []
    # r = '[http]{4}\\:\\/\\/([a-zA-Z]|[0-9])*(\\.([a-zA-Z]|[0-9])*)*(\\/([a-zA-Z]|[0-9])*)*\\s?'
    # tags = set(['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RP', 'RB', 'RBR', 'RBS', 'JJ', 'JJR', 'JJS'])
    # for index in range(len(doclist)):
    #     text = str(doclist[index])
    #     text = ' '.join([word for word in text.split()])
    #     text = re.sub(r, ' ', text)
    #     words = nltk.word_tokenize(text)
    #     pos_tags = nltk.pos_tag(words)
    #     ret = ' '.join([word for word, pos in pos_tags if pos in tags])
    #     # print(ret)
    #     tempdoclist.append(ret)
    #
    # r1 = '[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、::;;~@#¥%……&*()0123456789]+'
    # cachedStopWords = stopwords.words("english")
    # for index in range(len(tempdoclist)):
    #     text = str(tempdoclist[index])
    #     text = ' '.join([word for word in text.split()])
    #     text = re.sub(r1, ' ', text)
    #     text = ' '.join([word for word in text.split() if word not in cachedStopWords])
    #     # print(doclist[index])
    #     # print(text)
    #     # mydoclist.extend(doclist[index])
    #     mydoclist.append(text)
    for i in range(len(doclist)):
        mydoclist.extend(doclist[i])
    print(len(mydoclist))
    print(mydoclist)

    df2 = df[['A']]
    namelist = df2.values
    mynamelist = []
    for i in range(len(namelist)):
        mynamelist.extend(namelist[i])
    print(len(mynamelist))
    print(mynamelist)

    df3 = df[['C']]
    cataloglist = df3.values
    mycataloglist = []
    for i in range(len(cataloglist)):
        mycataloglist.extend(cataloglist[i])
    print(len(mycataloglist))
    print(mycataloglist)

    schema = Schema(name=TEXT(stored=True),
                    description=TEXT(stored=True),
                    catalog=TEXT(stored=True))
    # Create the index structure; path is where the index is created and
    # indexname is the index name
    ix = create_in("IndexSearching/index", schema=schema, indexname='indexname')
    writer = ix.writer()
    for i in range(len(mydoclist)):
        # Content being added
        writer.add_document(name=str(mynamelist[i]),
                            description=str(mydoclist[i]),
                            catalog=str(mycataloglist[i]))
        print("Finished building one index entry")
    writer.commit()
    # The index has now been built

    new_list = []
    index = open_dir("IndexSearching/index", indexname='indexname')  # open the index built above
    with index.searcher() as searcher:
        parser = QueryParser("description", index.schema)  # "description" is the field to search
        myquery = parser.parse("map OR internet OR GPS")
        results = searcher.search(myquery, limit=20)  # limit caps the result count, default is 10
        for result1 in results:
            print(dict(result1))
            new_list.append(dict(result1))
    os.mkdir("indexdir")

ix = index.create_in("indexdir", schema)
ix = index.open_dir("indexdir")

# Index the documents
writer = ix.writer()
writer.add_document(Name=u"Super Mario World (USA)",
                    Title=u"Super Mario World (USA)")
writer.add_document(Name=u"Frogger 2 - Swampy's Revenge (USA)",
                    Title=u"Frogger 2 - Swampy's Revenge (USA)")
writer.add_document(Name=u"akumajou",
                    Title=u"Akuma-Jou Dracula (Japan ver. N)")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("Name", ix.schema).parse(u'World Super')
    results = searcher.search(query)

    # Results
    found = results.scored_length()
    if results.has_exact_length():
        print("Scored", found, "of exactly", len(results), "documents")
    else:
        low = results.estimated_min_length()
        high = results.estimated_length()
        print("Scored", found, "of between", low, "and", high, "documents")

input("Press Enter to continue...")
    ix = create_in('./indexdir/', schema)

    # add documents
    news = utility.io.load_news()
    writer = ix.writer()
    print('Add documents...')
    for i, x in enumerate(news):
        if i % 1000 == 0:
            print('\t%d documents have been added.' % i)
        writer.add_document(title='news_%06d' % (i + 1), content=x)
    writer.commit()
else:
    print('Directly open previous indexed directory...')
    ix = open_dir('./indexdir')

print('Searching...')
parser = QueryParser('content', schema=ix.schema)
with ix.searcher() as searcher:
    queries_1 = utility.io.load_queries()
    queries_2, news_index, relevance = utility.io.load_training_data()
    td_sz = len(relevance)
    L = list()
    for idx, keyword in enumerate(QUERIES):
        ques = queries_1[idx]
        popout = []
        popout_0 = []
        popout_1 = []
        popout_2 = []
        popout_3 = []
        for j in range(td_sz):
            if queries_2[j] == ques and relevance[j] == 3 and (news_index[j] not in popout):
ix = index.create_in("indexdir", schema)
writer = ix.writer()
for root, dirs, filenames in os.walk('line_item_pkls'):
    for f in filenames:
        if (f != '.DS_Store'):
            body = ""
            line_items = pickle.load(
                io.open('line_item_pkls/' + f, 'r', encoding='utf-8'))
            for item in line_items:
                for i in range(int(item["number"])):
                    body += item["main item"].decode('unicode-escape') + " "
            writer.add_document(title=f.decode('unicode-escape'), body=body)
writer.commit()

with ix.searcher() as searcher:
    qp = QueryParser("body", schema=ix.schema)
    print "Hello friends, what would you like to eat today?",
    terms = raw_input()
    q = qp.parse(terms.decode('unicode-escape'))
    results = searcher.search(q)
    print results[0:9]
import sys
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import urllib.parse

ix = open_dir("index")
results = None
searcher = ix.searcher()
parser = QueryParser("title", ix.schema)
query = parser.parse(" ".join(sys.argv[1:]))
print(query)
results = searcher.search(query, limit=None)
print(results)
for result in results:
    print(result['title'], '\n\t', result['content'])
def search(indices):
    inpval = 1
    # ----------> The above determines which category to search by
    qparsers = []
    if inpval == 1:
        categories = []
        # Note: there is a problem here. When documents are indexed incrementally,
        # duplicates easily appear, so the data is duplicated too; the original
        # documents live in the app function
        for ix in indices:
            # ix: FileIndex(FileStorage('indexdir'), 'superfamicom.db')
            cat = list(ix.schema._fields.keys())
            # ix.schema: <Schema: ['Developer', 'Publisher', 'ReleaseDate', 'Title']>
            # ix.schema._fields {'ReleaseDate': TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None),....
            # cat: ['Developer', 'ReleaseDate', 'Publisher', 'Title']
            print("cat:", cat)
            for val in cat:
                if val in categories:
                    val = val + " - " + ix.indexname  # if names repeat, make them distinguishable
                categories.append(val)
        # print("categories", categories)
        # -----------------> The above builds the categories list; it is really there to
        # distinguish duplicate field names across tables and put them all in one list

        s = "If 1, selection is by category; choose which category to search by: \n"
        # print(categories)
        inpval = 0
        # indices:
        # [FileIndex(FileStorage('indexdir'), 'dinosaur.db'), FileIndex(FileStorage('indexdir'), 'mmorpg.db'), FileIndex(FileStorage('indexdir'), 'superfamicom.db')]
        for i in range(0, len(categories)):
            # go through all categories giving options
            s += "\t" + str(i) + " to search in [" + categories[i] + "] \n"
        s += "--->: "
        inpval = int(input(s))  # choose which field to query on

        for i in range(0, len(indices)):
            qparsers.append(QueryParser(categories[inpval], indices[i].schema))

    # results holds the hits
    print("Content you want to search for:")
    inp = input("--->")
    data = inp.split('--->')
    print(data[0])

    queries = [qparsers[i].parse(data[0].strip()) for i in range(0, len(qparsers))]
    print("len(queries): ", len(queries))
    print("len(indices):", len(indices))

    # if len(data) > 1:
    #     limits = int(data[1].strip())
    # limits = 10  # limits is the number of results to show

    results = []
    stats = {}
    limit = 10
    for i in range(0, len(indices)):
        # indices: [FileIndex(FileStorage('indexdir'), 'dinosaur.db'), ...
        searcher = indices[i].searcher()
        res = searcher.search(queries[i], limit=limit)
        if len(res) != 0:
            # ix.indexname--> superfamicom.db
            stats[indices[i].indexname] = len(res)
        else:
            continue
        results.extend(res)

    # print(results)
    my_result = []
    for i in results:
        my_result.append(dict(i))
    print(my_result)
#cltk_index.index_corpus()

#_results = cltk_index.corpus_query('amicitia')
#_results = cltk_index.corpus_query('ἀνὴρ')
#print(_results)

user_dir = os.path.expanduser('~/cltk_data/user_data/search')
output_file = 'amicitia.html'
output_path = os.path.join(user_dir, output_file)

_index = open_dir('/Users/kyle/cltk_data/latin/index/phi5/work/')

query = 'amicitia'
output_str = ''

with _index.searcher() as searcher:
    _query = QueryParser("content", _index.schema).parse(query)
    results = searcher.search(_query, limit=None)
    results.fragmenter.charlimit = None

    # Allow larger fragments
    results.fragmenter.maxchars = 300
    # Show more context before and after
    results.fragmenter.surround = 50

    docs_number = searcher.doc_count_all()
    output_str += 'Docs containing hits: {}.'.format(docs_number) + '</br></br>'

    for hit in results:
        author = hit['author']
def run_query(text, index, bm25_params={}, **kwargs):
    # , qf="title_text_en^2 abstract_text_en^2 body_text_en^1.1", fields=['id','score'], size=1000, max_year=2016
    if type(index) is pysolr.Solr:
        kwargs['verb'] = 1
        qf = "text^1" if 'qf' not in kwargs else kwargs['qf']
        return_fields = ['id', 'score'] if 'return_fields' not in kwargs else kwargs['return_fields']  # return fields
        size = 1000 if 'size' not in kwargs else kwargs['size']
        max_year = 2016 if 'max_year' not in kwargs else kwargs['max_year']
        parser = 'edismax' if 'parser' not in kwargs else kwargs['parser']

        if 'verb' in kwargs:
            print(text)
            print(qf)
            print(bm25_params)

        if len(bm25_params) > 0:
            bm25.set_params(**bm25_params)

        q_params = {
            "fl": ','.join(return_fields),
            # "fq": "body_text_en:[* TO *] AND date_i:[* TO " + str(max_year) + "]",
            "fq": "date_i:[* TO " + str(max_year) + "]",
            # "pf": "abstract_text_en^1.2 title_text_en^2",
            # "start": "1",
            "rows": str(size),  # return maximum 1000 results,
            "defType": parser
        }
        if max_year == 0 or max_year >= 2016:
            q_params.pop('fq')
        if len(qf) > 0:
            q_params["qf"] = qf

        result = index.search(text, **q_params)
        return result, return_fields
    else:
        kwargs['verb'] = 1
        qf = "text^1" if 'qf' not in kwargs else kwargs['qf']
        return_fields = ['id', 'score'] if 'return_fields' not in kwargs else kwargs['return_fields']  # return fields
        size = 1000 if 'size' not in kwargs else kwargs['size']
        max_year = 0 if 'max_year' not in kwargs else kwargs['max_year']
        # parser = 'edismax' if 'parser' not in kwargs else kwargs['parser']

        qf_fields = [s.split("^")[0] for s in qf.split()]
        qf_boosts = [1 if len(s.split("^")) == 1 else float(s.split("^")[1]) for s in qf.split()]
        qff = [f for f, b in zip(qf_fields, qf_boosts) if b != 0]
        qfb = [b for f, b in zip(qf_fields, qf_boosts) if b != 0]
        boost_dict = {}
        for f, b in zip(qff, qfb):
            boost_dict[f] = b

        if 'verb' in kwargs:
            print(text)
            print(qf)
            print()

        output = []
        if len(bm25_params) > 0:
            w = scoring.BM25F(**bm25_params)
        else:
            w = scoring.BM25F()
            print('Default scoring')

        with index.searcher(weighting=w) as searcher:
            query = MultifieldParser(qff, index.schema, fieldboosts=boost_dict, group=OrGroup).parse(text)
            if max_year > 0:
                mask_q = QueryParser("year", index.schema).parse("date_i:[" + str(max_year) + " to]")
                results = searcher.search(query, limit=size, mask=mask_q)
            else:
                results = searcher.search(query, limit=size)

            for r in results:
                results_row = {}
                results_row['score'] = r.score
                for f in return_fields:
                    if f not in results_row:
                        # print(r)
                        if f in r:
                            results_row[f] = r[f]
                        else:
                            results_row[f] = ''
                output.append(results_row)

        return output, return_fields


# solr = pysolr.Solr("http://130.155.204.198:8983/solr/trec-cds-2016", timeout=1200)
# res1 = run_query('adult^1 elderly^1 man^1 calf^1 pain^1 walking^1 uphill^1 history^1 ischemic^1 heart^1 disease^1 worsening^1 hypertension^1 despite^1 medication^1 compliance^1 physical^1 exam^1 right^1 carotid^1 bruit^1 lower^1 extremities^1 cool^1 diminished^1 dorsalis^1 pedis^1 pulses^1', solr, qf='title_text_en^2 abstract_text_en^2 body_text_en^1.1', max_year=2013, size=5)
# res2 = run_query('adult^1 elderly^1 man^1 calf^1 pain^1 walking^1 uphill^1 history^1 ischemic^1 heart^1 disease^1 worsening^1 hypertension^1 despite^1 medication^1 compliance^1 physical^1 exam^1 right^1 carotid^1 bruit^1 lower^1 extremities^1 cool^1 diminished^1 dorsalis^1 pedis^1 pulses^1', solr, qf='text^1', max_year=2013, size=5)
#
# rs = [res1, res2]
#
# for r in rs:
#     for line in r:
#         print (line)
#     print ()
    def corpus_query(self, query, save_file=None, window_size=300, surround_size=50):
        """Send query to a corpus's index. `save_file` is a filename.

        >>> cltk_index = CLTKIndex('latin', 'phi5')
        >>> results = cltk_index.corpus_query('amicitia')

        :type save_file: str
        """
        _index = open_dir(self.index_path)

        output_str = ''

        with _index.searcher() as searcher:
            _query = QueryParser("content", _index.schema).parse(query)
            results = searcher.search(_query, limit=None)
            results.fragmenter.charlimit = None

            # Allow larger fragments
            results.fragmenter.maxchars = window_size
            # Show more context before and after
            results.fragmenter.surround = surround_size

            docs_number = searcher.doc_count_all()
            output_str += 'Docs containing hits: {}.'.format(docs_number) + '</br></br>'

            for hit in results:
                author = hit['author']
                filepath = hit['path']
                output_str += author + '</br>'
                output_str += filepath + '</br>'

                with open(filepath) as file_open:
                    file_contents = file_open.read()

                highlights = hit.highlights("content", text=file_contents, top=10000000)
                lines = highlights.split('\n')
                #lines_numbers = [l for l in lines]
                lines_br = '</br>'.join(lines)
                lines_number_approx = len(lines)
                output_str += 'Approximate hits: {}.'.format(lines_number_approx) + '</br>'
                output_str += lines_br + '</br></br>'

        if save_file:
            user_dir = os.path.expanduser('~/cltk_data/user_data/search')
            output_path = os.path.join(user_dir, save_file + '.html')
            try:
                with open(output_path, 'w') as file_open:
                    file_open.write(output_str)
            except FileNotFoundError:
                os.mkdir(user_dir)
                with open(output_path, 'w') as file_open:
                    file_open.write(output_str)
        else:
            return output_str
def search():
    query = request.form['query']
    q = []
    q.append(query)
    r = []  # complete path
    c = []  # preview of the paste content
    paste_date = []
    paste_size = []
    paste_tags = []
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    '''
    temporary disabled
    # # TODO: search by filename/item id
    '''

    # Search full line
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("".join(q))
        results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
        for x in results:
            r.append(x.items()[0][1].replace(PASTES_FOLDER, '', 1))
            path = x.items()[0][1].replace(PASTES_FOLDER, '', 1)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(content) > max_preview_char else len(content) - 1
            c.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
            paste_date.append(curr_date)
            paste_size.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:' + path)
            l_tags = []
            for tag in p_tags:
                complete_tag = tag
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]
                l_tags.append((tag, complete_tag))
            paste_tags.append(l_tags)
        results = searcher.search(query)
        num_res = len(results)

    index_list = get_index_list()
    index_min = 1
    index_max = len(index_list)

    return render_template("search.html", r=r, c=c, query=request.form['query'],
                           paste_date=paste_date, paste_size=paste_size,
                           char_to_display=max_preview_modal, num_res=num_res,
                           index_min=index_min, index_max=index_max,
                           bootstrap_label=bootstrap_label, paste_tags=paste_tags,
                           index_list=index_list)
    def find_path(self, path):
        parser = QueryParser('path', self._index.schema)
        query = parser.parse(path)
        return self._search(query, limit=1)
writer.add_document(
    title=u"document2",
    path=u"/b",
    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果")
writer.add_document(title=u"document3", path=u"/c", content=u"买水果然后来世博园。")
writer.add_document(title=u"document4",
                    path=u"/c",
                    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
writer.add_document(title=u"document4", path=u"/c", content=u"咱俩交换一下吧。")
writer.commit()

searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)

for keyword in (u"水果世博园", u"你", u"first", u"中文", u"交换机", u"交换"):
    print "result of ", keyword
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:
        print hit.highlights("content")
    print "=" * 10

words_train('movie.txt', 'movie_key.txt', 'movie.graph')
cor = YahaCorrector('movie_key.txt', 'movie.graph')
sugs = cor.suggest(u"刘牛德")
print " ".join(sugs)
    def find_text(self, text, pagenum=1, limit=10):
        parser = QueryParser('content', self._index.schema)
        query = parser.parse(text)
        return self._search(query, pagenum, limit)
sentences = sent_detector.tokenize(review)
writer = ix.writer()
for sentence in sentences:
    writer.add_document(content=unicode(sentence))

feature = agreement[1] if agreement[1] is not None else agreement[2]
if(feature == "size limit"):
    print "stop"
writer.commit()

sentences_feature = ""
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema,
                        group=qparser.OrGroup).parse(unicode(feature))
    results = searcher.search(query)
    if(len(results) == 0):
        sentences_feature = agreement[6]
    else:
        sentences_feature = results[0]["content"]

# Prepare SQL query to UPDATE a record in the database.
sql = "UPDATE agreements SET sentence = (%s) WHERE id = (%s) "
try:
    # Execute the SQL command
    cursor.execute(sql, (str(sentences_feature).encode('ascii', 'ignore'), agreement[5]))
    # Commit your changes in the database
    db.commit()
except Exception, e:
    # Rollback in case there is any error
def main():
    arg_parser = argparse.ArgumentParser(
        description='Perform a full-text search over stored telegram messages.',
        prog='tgminer-search')

    arg_parser.add_argument('--version', action='version',
                            version='%(prog)s ' + tgminer.__version__)

    arg_parser.add_argument('query', help='Query text.')

    arg_parser.add_argument(
        '--config',
        help='Path to TGMiner config file, defaults to "CWD/config.json". '
             'This will override the environmental variable '
             'TGMINER_CONFIG if it was defined.')

    arg_parser.add_argument(
        '--limit',
        help='Results limit, 0 for infinite. Default is 10.',
        type=query_limit(arg_parser),
        default=10)

    arg_parser.add_argument(
        '--markov',
        help='Generate a markov chain file from the messages in your query results.',
        metavar='OUT_FILE')

    arg_parser.add_argument(
        '--markov-state-size',
        default=None,
        help='The number of words to use in the markov model\'s state, default is 2. '
             'Must be used in conjunction with --markov.',
        type=markov_state_size(arg_parser))

    arg_parser.add_argument(
        '--markov-optimize',
        default=None,
        choices=('accuracy', 'size'),
        help='The default option "accuracy" produces a larger chain file where '
             'all trailing word/sequence probabilities are considered for every word in '
             'a message. This can result in a very large and slow to load chain if the '
             'state size is set to a high value. Setting this to "size" will cause '
             'trailing probabilities for the words inside the sequence that makes up a state '
             'to be discarded, except for the last word. This will make the chain smaller '
             'but results in more of an approximate model of the input messages.')

    args = arg_parser.parse_args()

    if args.markov_state_size is not None and args.markov is None:
        arg_parser.error(
            'Must be using the --markov option to use --markov-state-size.')

    if args.markov_optimize is not None and args.markov is None:
        arg_parser.error(
            'Must be using the --markov option to use --markov-optimize.')

    if args.markov_state_size is None:
        args.markov_state_size = 2

    if args.markov_optimize is None:
        args.markov_optimize = 'accuracy'

    config = None  # hush intellij highlighted undeclared variable use warning

    config_path = tgminer.config.get_config_path(args.config)

    if os.path.isfile(config_path):
        try:
            config = tgminer.config.TGMinerConfig(config_path)
        except tgminer.config.TGMinerConfigException as e:
            enc_print(str(e), file=sys.stderr)
            exit(exits.EX_CONFIG)
    else:
        enc_print(f'Cannot find tgminer config file: "{config_path}"')
        exit(exits.EX_NOINPUT)

    index = whoosh.index.open_dir(os.path.join(config.data_dir, 'indexdir'))

    index_lock_path = os.path.join(config.data_dir, 'tgminer_mutex')

    schema = tgminer.fulltext.LogSchema()

    query_parser = QueryParser('message', schema=schema)

    query = query_parser.parse(args.query)

    def result_iter():
        with fasteners.InterProcessLock(index_lock_path):
            with index.searcher() as searcher:
                yield from searcher.search(
                    query,
                    limit=None if args.limit < 1 else args.limit,
                    sortedby='timestamp')

    if args.markov:
        split_by_spaces = re.compile(r'\s+')

        chain = kovit.Chain()

        if args.markov_optimize == 'accuracy':
            word_iter = kovit.iters.iter_window
        else:
            word_iter = kovit.iters.iter_runs

        anything = False
        for hit in result_iter():
            message = hit.get('message', None)
            if message:
                anything = True
                for start, next_items in word_iter(
                        split_by_spaces.split(message), args.markov_state_size):
                    chain.add_to_bag(start, next_items)

        if not anything:
            enc_print('Query returned no messages!', file=sys.stderr)
            exit(exits.EX_SOFTWARE)

        try:
            with open(args.markov, 'w', encoding='utf-8') as m_out:
                chain.dump_json(m_out)
        except OSError as e:
            enc_print(
                f'Could not write markov chain to file "{args.markov}", error: {e}',
                file=sys.stderr)
            exit(exits.EX_CANTCREAT)
    else:
        for hit in result_iter():
            message = hit.get('message', None)
            username = hit.get('username', None)
            alias = hit.get('alias', 'NO_ALIAS')
            to_username = hit.get('to_username', None)
            to_alias = hit.get('to_alias', None)
            to_id = hit.get('to_id')

            username_part = f' [@{username}]' if username else ''

            timestamp = config.timestamp_format.format(hit['timestamp'])

            chat_slug = hit['chat']

            media = hit.get('media', None)

            to_username_part = f' [@{to_username}]' if to_username else ''

            to_user_part = f' to {to_alias}{to_username_part}' if to_alias or to_username_part else ''

            if media:
                caption_part = f' Caption: {message}' if message else ''
                enc_print(
                    f'{timestamp} chat="{chat_slug}" to_id="{to_id}"{to_user_part} | '
                    f'{alias}{username_part}: {media}{caption_part}')
            else:
                enc_print(
                    f'{timestamp} chat="{chat_slug}" to_id="{to_id}"{to_user_part} | '
                    f'{alias}{username_part}: {hit["message"]}')