def get(self, request, *args, **kwargs):
    """Return up to 10 de-duplicated Solr suggestions for the query term.

    Expects 'q', 'dictionary' (the suggester to use) and 'manuscript'
    GET parameters; answers with an empty Response when the required
    ones are missing.
    """
    params = request.GET
    if 'q' not in params or 'dictionary' not in params:
        return Response()

    q = u"{0}".format(params['q'])
    # The suggester to use
    dictionary = u"{0}".format(params['dictionary'])
    # Can be '*' when searching through all manuscripts
    manuscript = u"{0}".format(params['manuscript'])

    connection = solr.Solr(settings.SOLR_SERVER)
    search_handler = solr.SearchHandler(connection, "/suggest")
    # TODO fix solr so that the suggesters work with a context field (cfq)
    # search_results = search_handler(q=q, suggest_dictionary=dictionary, suggest_cfq=manuscript)
    search_results = search_handler(q=q, suggest_dictionary=dictionary)

    results = search_results.suggest[dictionary][q]
    # Remove duplicates from the suggestions and limit the return number to 10
    results['suggestions'] = self._get_filtered_results(results['suggestions'])
    return Response(results)
def first_core_of_connection(solr_connection):
    """Return the name of one core registered on the Solr server.

    Issues a STATUS action against the cores admin handler and returns
    the first core name found in the JSON response (order is whatever
    the server/JSON decoder yields).

    :param solr_connection: an open ``solr.Solr`` connection
    :returns: a core name string
    """
    response = solr.SearchHandler(solr_connection, "/admin/cores").raw(**{
        'action': 'STATUS',
        'wt': 'json',
    })
    results = json.loads(response)
    # The original ``results['status'].keys().pop()`` only works on
    # Python 2 (py3 dict views have no pop()); next(iter(...)) picks a
    # key the same way on both versions.
    return next(iter(results['status']))
def query_math_docs(query_string, query_format): select = solr.SearchHandler(conn, "/select") request_string = "{!mathqueryparser} FORMAT(" + query_format.lower( ) + ")" + query_string + "&rows=20" print request_string response = select.__call__(request_string) if not response: print "No results" else: for hit in response: print hit['filename'] + ":" + hit['math_notational_field'] return response #add_math_file(1, "arx1312.6708.eq", "/share/math/arxiv_cds/arx1312.6708.eq")
return row.value if __name__ == "__main__": parser = argparse.ArgumentParser( description='Make csv report of indexed collections') parser.add_argument('auth_token', help='Authentication token') parser.add_argument('--solr_url', help='Solr index url') parser.add_argument('--couchdb_url', help='CouchDB url') args = parser.parse_args() solr_url = args.solr_url if args.solr_url else SOLR_URL print "SOLR_URL:{}".format(solr_url) SOLR = solr.SearchHandler( solr.Solr( solr_url, post_headers={ 'X-Authentication-Token': args.auth_token, }, ), "/query") if args.couchdb_url: cdb = get_couchdb(url_couchdb=couchdb_url, dbname='ucldc') else: cdb = get_couchdb(dbname='ucldc') collections = get_indexed_collection_list(SOLR) date_to_minute = datetime.datetime.now().strftime('%Y%m%d-%H%M') fname = 'indexed_collections-{}.csv'.format(date_to_minute) with open(fname, 'wb') as csvfile: csvwriter = UnicodeWriter(csvfile) csvwriter.writerow( ('Collection Name', 'Collection URL', 'Number in index', 'Number in couchdb', 'Number in OAC', 'Couch missing in solr',
from __future__ import print_function import solr import sys import re con = solr.Solr('http://localhost:8983/solr/wikipedia_core') mlt = solr.SearchHandler(con, "/mlt") THRESHOLD = 0.9 def process(line): # replace some known utf-8 chars with ascii line = re.sub("\xe2\x80\x99", "x", line) # U+2019 (right single quotation mark) line = re.sub("\xe2\x80\x93", "-", line) # U+2013 (EN-DASH) # remove the rest of the non-ascii chars line = re.sub(r'[^\x00-\x7F]+', ' ', line) r = mlt(mlt_fl='page_text', mlt_mindf=1, mlt_mintf=1, stream_body=line) return ",".join([ d['page_name'] for d in r.results if d['score'] / r.maxScore >= THRESHOLD ]).encode('utf-8') def main(): try: for line in sys.stdin: print(process(line.strip())) except:
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Solr utilities. """ import itertools from invenio.config import CFG_SOLR_URL from intbitset import intbitset from invenio.ext.logging import register_exception if CFG_SOLR_URL: import solr conn = solr.Solr(CFG_SOLR_URL) SOLR_CONNECTION = solr.SolrConnection(CFG_SOLR_URL) # pylint: disable=E1101 SOLR_MLT_CONNECTION = solr.SearchHandler(conn, "/mlt") BOOLEAN_EQUIVALENTS = {"+": "AND", "|": "OR", "-": "NOT"} def get_collection_filter(hitset, cutoff_amount): # The last n hitset records are considered to be newest and therfore most relevant start_index = len(hitset) - cutoff_amount if start_index < 0: start_index = 0 it = itertools.islice(hitset, start_index, None) ids = ' '.join([str(recid) for recid in it]) if ids: return 'id:(%s)' % ids else:
for word in counts.keys(): c[word] = int(counts[word]['tf']) return c s = solr.Solr('http://localhost:8983/solr') #response = s.select('*:*') #print response #pdb.set_trace() field_name = 'includes' tvrh = solr.SearchHandler(s, '/tvrh') response = tvrh.__call__(q='*:*', tv_tf='true', tv_fl=field_name, fl='none', rows=1000) for x in xrange(0, 1000): print x tv = response.termVectors #del tv['uniqueKeyFieldName'] response = response.next_batch() #print sum( [ get_tf_counter ( field_name, value) for value in response.termVectors.values() ], Counter() )
import solr import random import json import datetime SOLR_NEW = solr.Solr('http://127.0.0.1:8983/solr/dc-collection') SOLR=solr.SearchHandler(solr.Solr('https://registry.cdlib.org/solr', post_headers = { 'X-Authentication-Token':'xxxyyyzzz'}), "/query") def get_collection_urls(): q_collections=SOLR(q="*:*", rows=0, facet_field="collection_url", facet="true", facet_limit=20000) facets = q_collections.facet_counts f_fields = facets['facet_fields'] return f_fields['collection_url'] def get_random_docs(collection_urls): docs = [] for u in collection_urls: recs_in_coll = SOLR(q="collection_url:{}".format(u)) num = recs_in_coll.numFound sample_size = num / 100 if num / 100 else 1 print "NUMBER:{} SAMPLE:{}".format(num, sample_size) for i in range(sample_size): rand_index = random.randrange(num) q_rec = SOLR(q="collection_url:{}".format(u), rows=1, start=rand_index) #save locally doc_new = {} for key, val in q_rec.results[0].items(): if '_ss' in key: continue
if py_version.major == 2 and py_version.minor == 7 and py_version.micro > 8: #disable ssl verification import ssl ssl._create_default_https_context = ssl._create_unverified_context import os from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_by_collection import solr SOLR_URL = os.environ.get('URL_SOLR_API', None) SOLR_API_KEY = os.environ.get('SOLR_API_KEY', None) SOLR = solr.SearchHandler( solr.Solr( SOLR_URL, post_headers={ 'X-Authentication-Token': SOLR_API_KEY, }, ), "/query") def fill_object_values_from_solr(doc): '''If no object field, try to get from current solr ''' if 'object' not in doc: query = 'harvest_id_s:"{}"'.format(doc['_id']) msg = "NO OBJECT FOR {}".format(doc['_id']) resp = SOLR( q=query, fields= 'harvest_id_s, reference_image_md5, id, collection_url, reference_image_dimensions',
from django.http import HttpResponse from django.shortcuts import render_to_response import datetime import pprint import solr import logging logging.basicConfig(level=logging.INFO) solr_url = "http://localhost:8080/discovery" #solr_url = "http://192.168.45.10:8983/solr/aadda-discovery" s = solr.Solr(solr_url, debug=True) s.select = solr.SearchHandler(s, "/select","__") # Create your views here. def search(request): q = request.GET.get('q', "*") r = s.select( q, sort='sentiment_score', sort_order='asc', group='true', group__field='domain', group__main='true', facet='true', facet__sort='count', facet__field=['crawl_year', 'content_type', 'content_ffb'], facet__mincount=1, f__crawl_year__facet__mincount=0) print dir(r) return render_to_response('search_result_list.html', {'numFound': r, 'r': r, 'facets': r.facet_counts['facet_fields']})